diff --git a/.circleci/config.yml b/.circleci/config.yml index 7b839f250dc1..4e6718005ab7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -12,7 +12,7 @@ jobs: # Ensure running with CircleCI/huggingface check_circleci_user: docker: - - image: cimg/python:3.7.12 + - image: cimg/python:3.8.12 parallelism: 1 steps: - run: echo $CIRCLE_PROJECT_USERNAME @@ -26,13 +26,13 @@ jobs: fetch_tests: working_directory: ~/transformers docker: - - image: cimg/python:3.7.12 + - image: cimg/python:3.8.12 parallelism: 1 steps: - checkout - - run: pip install --upgrade pip - - run: pip install GitPython - - run: pip install . + - run: pip install --upgrade --upgrade-strategy eager pip + - run: pip install -U --upgrade-strategy eager GitPython + - run: pip install -U --upgrade-strategy eager . - run: mkdir -p test_preparation - run: python utils/tests_fetcher.py | tee tests_fetched_summary.txt - store_artifacts: @@ -43,6 +43,24 @@ jobs: else touch test_preparation/test_list.txt fi + - run: | + if [ -f examples_test_list.txt ]; then + mv examples_test_list.txt test_preparation/examples_test_list.txt + else + touch test_preparation/examples_test_list.txt + fi + - run: | + if [ -f filtered_test_list_cross_tests.txt ]; then + mv filtered_test_list_cross_tests.txt test_preparation/filtered_test_list_cross_tests.txt + else + touch test_preparation/filtered_test_list_cross_tests.txt + fi + - run: | + if [ -f doctest_list.txt ]; then + cp doctest_list.txt test_preparation/doctest_list.txt + else + touch test_preparation/doctest_list.txt + fi - run: | if [ -f test_repo_utils.txt ]; then mv test_repo_utils.txt test_preparation/test_repo_utils.txt @@ -56,15 +74,10 @@ jobs: else touch test_preparation/filtered_test_list.txt fi - - run: python utils/tests_fetcher.py --filters tests examples | tee examples_tests_fetched_summary.txt - - run: | - if [ -f test_list.txt ]; then - mv test_list.txt test_preparation/examples_test_list.txt - else - touch test_preparation/examples_test_list.txt - fi - store_artifacts: path: test_preparation/test_list.txt + - store_artifacts: + path: test_preparation/doctest_list.txt - store_artifacts: path: ~/transformers/test_preparation/filtered_test_list.txt - store_artifacts: @@ -78,6 +91,8 @@ jobs: - run: cp test_preparation/generated_config.yml test_preparation/generated_config.txt - store_artifacts: path: test_preparation/generated_config.txt + - store_artifacts: + path: test_preparation/filtered_test_list_cross_tests.txt - continuation/continue: configuration_path: test_preparation/generated_config.yml @@ -85,17 +100,17 @@ jobs: fetch_all_tests: working_directory: ~/transformers docker: - - image: cimg/python:3.7.12 + - image: cimg/python:3.8.12 parallelism: 1 steps: - checkout - - run: pip install --upgrade pip - - run: pip install GitPython - - run: pip install . + - run: pip install --upgrade --upgrade-strategy eager pip + - run: pip install -U --upgrade-strategy eager GitPython + - run: pip install -U --upgrade-strategy eager . 
- run: | mkdir test_preparation echo -n "tests" > test_preparation/test_list.txt - echo -n "tests" > test_preparation/examples_test_list.txt + echo -n "all" > test_preparation/examples_test_list.txt echo -n "tests/repo_utils" > test_preparation/test_repo_utils.txt - run: | echo -n "tests" > test_list.txt @@ -111,7 +126,7 @@ jobs: check_code_quality: working_directory: ~/transformers docker: - - image: cimg/python:3.7.12 + - image: cimg/python:3.8.12 resource_class: large environment: TRANSFORMERS_IS_CI: yes @@ -121,31 +136,38 @@ jobs: - checkout - restore_cache: keys: - - v0.5-code_quality-{{ checksum "setup.py" }} - - v0.5-code-quality - - run: pip install --upgrade pip - - run: pip install .[all,quality] + - v0.7-code_quality-pip-{{ checksum "setup.py" }} + - v0.7-code-quality-pip + - restore_cache: + keys: + - v0.7-code_quality-site-packages-{{ checksum "setup.py" }} + - v0.7-code-quality-site-packages + - run: pip install --upgrade --upgrade-strategy eager pip + - run: pip install -U --upgrade-strategy eager .[all,quality] - save_cache: - key: v0.5-code_quality-{{ checksum "setup.py" }} + key: v0.7-code_quality-pip-{{ checksum "setup.py" }} paths: - '~/.cache/pip' + - save_cache: + key: v0.7-code_quality-site-packages-{{ checksum "setup.py" }} + paths: + - '~/.pyenv/versions/' - run: name: Show installed libraries and their versions command: pip freeze | tee installed.txt - store_artifacts: path: ~/transformers/installed.txt - - run: black --check --preview examples tests src utils - - run: isort --check-only examples tests src utils + - run: black --check examples tests src utils + - run: ruff examples tests src utils - run: python utils/custom_init_isort.py --check_only - run: python utils/sort_auto_mappings.py --check_only - - run: flake8 examples tests src utils - run: doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source - run: python utils/check_doc_toc.py check_repository_consistency: working_directory: ~/transformers docker: - - image: cimg/python:3.7.12 + - image: cimg/python:3.8.12 resource_class: large environment: TRANSFORMERS_IS_CI: yes @@ -155,14 +177,22 @@ jobs: - checkout - restore_cache: keys: - - v0.5-repository_consistency-{{ checksum "setup.py" }} - - v0.5-repository_consistency - - run: pip install --upgrade pip - - run: pip install .[all,quality] + - v0.7-repository_consistency-pip-{{ checksum "setup.py" }} + - v0.7-repository_consistency-pip + - restore_cache: + keys: + - v0.7-repository_consistency-site-packages-{{ checksum "setup.py" }} + - v0.7-repository_consistency-site-packages + - run: pip install --upgrade --upgrade-strategy eager pip + - run: pip install -U --upgrade-strategy eager .[all,quality] - save_cache: - key: v0.5-repository_consistency-{{ checksum "setup.py" }} + key: v0.7-repository_consistency-pip-{{ checksum "setup.py" }} paths: - '~/.cache/pip' + - save_cache: + key: v0.7-repository_consistency-site-packages-{{ checksum "setup.py" }} + paths: + - '~/.pyenv/versions/' - run: name: Show installed libraries and their versions command: pip freeze | tee installed.txt @@ -174,9 +204,12 @@ jobs: - run: python utils/check_repo.py - run: python utils/check_inits.py - run: python utils/check_config_docstrings.py + - run: python utils/check_config_attributes.py + - run: python utils/check_doctest_list.py - run: make deps_table_check_updated - - run: python utils/tests_fetcher.py --sanity_check - run: python utils/update_metadata.py --check-only + - run: python utils/check_task_guides.py + - run: python 
utils/check_docstrings.py workflows: version: 2 diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 599691bf1006..45ba5cd10cc5 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -15,16 +15,37 @@ import argparse import copy +import glob import os +import random from dataclasses import dataclass from typing import Any, Dict, List, Optional import yaml -COMMON_ENV_VARIABLES = {"OMP_NUM_THREADS": 1, "TRANSFORMERS_IS_CI": True, "PYTEST_TIMEOUT": 120} -COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "dist": "loadfile", "s": None} -DEFAULT_DOCKER_IMAGE = [{"image": "cimg/python:3.7.12"}] +COMMON_ENV_VARIABLES = { + "OMP_NUM_THREADS": 1, + "TRANSFORMERS_IS_CI": True, + "PYTEST_TIMEOUT": 120, + "RUN_PIPELINE_TESTS": False, + "RUN_PT_TF_CROSS_TESTS": False, + "RUN_PT_FLAX_CROSS_TESTS": False, +} +# Disable the use of {"s": None} as the output is way too long, causing the navigation on CircleCI impractical +COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "dist": "loadfile"} +DEFAULT_DOCKER_IMAGE = [{"image": "cimg/python:3.8.12"}] + + +class EmptyJob: + job_name = "empty" + + def to_dict(self): + return { + "working_directory": "~/transformers", + "docker": copy.deepcopy(DEFAULT_DOCKER_IMAGE), + "steps":["checkout"], + } @dataclass @@ -32,7 +53,7 @@ class CircleCIJob: name: str additional_env: Dict[str, Any] = None cache_name: str = None - cache_version: str = "0.5" + cache_version: str = "0.7" docker_image: List[Dict[str, str]] = None install_steps: List[str] = None marker: Optional[str] = None @@ -42,6 +63,8 @@ class CircleCIJob: resource_class: Optional[str] = "xlarge" tests_to_run: Optional[List[str]] = None working_directory: str = "~/transformers" + # This should be only used for doctest job! + command_timeout: Optional[int] = None def __post_init__(self): # Deal with defaults for mutable attributes. 
@@ -58,12 +81,21 @@ def __post_init__(self): self.pytest_options = {} if isinstance(self.tests_to_run, str): self.tests_to_run = [self.tests_to_run] + if self.parallelism is None: + self.parallelism = 1 def to_dict(self): + env = COMMON_ENV_VARIABLES.copy() + env.update(self.additional_env) + + cache_branch_prefix = os.environ.get("CIRCLE_BRANCH", "pull") + if cache_branch_prefix != "main": + cache_branch_prefix = "pull" + job = { "working_directory": self.working_directory, "docker": self.docker_image, - "environment": {**COMMON_ENV_VARIABLES, **self.additional_env}, + "environment": env, } if self.resource_class is not None: job["resource_class"] = self.resource_class @@ -75,8 +107,21 @@ def to_dict(self): { "restore_cache": { "keys": [ - f"v{self.cache_version}-{self.cache_name}-" + '{{ checksum "setup.py" }}', - f"v{self.cache_version}-{self.cache_name}-", + # check the fully-matched cache first + f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-pip-" + '{{ checksum "setup.py" }}', + # try the partially-matched cache from `main` + f"v{self.cache_version}-{self.cache_name}-main-pip-", + # try the general partially-matched cache + f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-pip-", + ] + } + }, + { + "restore_cache": { + "keys": [ + f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-site-packages-" + '{{ checksum "setup.py" }}', + f"v{self.cache_version}-{self.cache_name}-main-site-packages-", + f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-site-packages-", ] } }, @@ -85,28 +130,137 @@ def to_dict(self): steps.append( { "save_cache": { - "key": f"v{self.cache_version}-{self.cache_name}-" + '{{ checksum "setup.py" }}', + "key": f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-pip-" + '{{ checksum "setup.py" }}', "paths": ["~/.cache/pip"], } } ) + steps.append( + { + "save_cache": { + "key": f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-site-packages-" + '{{ checksum "setup.py" }}', + "paths": ["~/.pyenv/versions/"], + } + } + ) steps.append({"run": {"name": "Show installed libraries and their versions", "command": "pip freeze | tee installed.txt"}}) steps.append({"store_artifacts": {"path": "~/transformers/installed.txt"}}) all_options = {**COMMON_PYTEST_OPTIONS, **self.pytest_options} - pytest_flags = [f"--{key}={value}" if value is not None else f"-{key}" for key, value in all_options.items()] + pytest_flags = [f"--{key}={value}" if (value is not None or key in ["doctest-modules"]) else f"-{key}" for key, value in all_options.items()] pytest_flags.append( f"--make-reports={self.name}" if "examples" in self.name else f"--make-reports=tests_{self.name}" ) - test_command = f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags) - if self.tests_to_run is None: - test_command += " << pipeline.parameters.tests_to_run >>" + + steps.append({"run": {"name": "Create `test-results` directory", "command": "mkdir test-results"}}) + + test_command = "" + if self.command_timeout: + test_command = f"timeout {self.command_timeout} " + test_command += f"python -m pytest --junitxml=test-results/junit.xml -n {self.pytest_num_workers} " + " ".join(pytest_flags) + + if self.parallelism == 1: + if self.tests_to_run is None: + test_command += " << pipeline.parameters.tests_to_run >>" + else: + test_command += " " + " ".join(self.tests_to_run) else: - test_command += " " + " ".join(self.tests_to_run) + # We need explicit list instead of `pipeline.parameters.tests_to_run` (only available at job 
runtime) + tests = self.tests_to_run + if tests is None: + folder = os.environ["test_preparation_dir"] + test_file = os.path.join(folder, "filtered_test_list.txt") + if os.path.exists(test_file): + with open(test_file) as f: + tests = f.read().split(" ") + + # expand the test list + if tests == ["tests"]: + tests = [os.path.join("tests", x) for x in os.listdir("tests")] + expanded_tests = [] + for test in tests: + if test.endswith(".py"): + expanded_tests.append(test) + elif test == "tests/models": + expanded_tests.extend([os.path.join(test, x) for x in os.listdir(test)]) + elif test == "tests/pipelines": + expanded_tests.extend([os.path.join(test, x) for x in os.listdir(test)]) + else: + expanded_tests.append(test) + # Avoid long tests always being collected together + random.shuffle(expanded_tests) + tests = " ".join(expanded_tests) + + # Each executor to run ~10 tests + n_executors = max(len(tests) // 10, 1) + # Avoid empty test list on some executor(s) or launching too many executors + if n_executors > self.parallelism: + n_executors = self.parallelism + job["parallelism"] = n_executors + + # Need to be newline separated for the command `circleci tests split` below + command = f'echo {tests} | tr " " "\\n" >> tests.txt' + steps.append({"run": {"name": "Get tests", "command": command}}) + + command = 'TESTS=$(circleci tests split tests.txt) && echo $TESTS > splitted_tests.txt' + steps.append({"run": {"name": "Split tests", "command": command}}) + + steps.append({"store_artifacts": {"path": "~/transformers/tests.txt"}}) + steps.append({"store_artifacts": {"path": "~/transformers/splitted_tests.txt"}}) + + test_command = "" + if self.timeout: + test_command = f"timeout {self.timeout} " + test_command += f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags) + test_command += " $(cat splitted_tests.txt)" if self.marker is not None: test_command += f" -m {self.marker}" - test_command += " | tee tests_output.txt" + + if self.name == "pr_documentation_tests": + # can't use ` | tee tee tests_output.txt` as usual + test_command += " > tests_output.txt" + # Save the return code, so we can check if it is timeout in the next step. + test_command += '; touch "$?".txt' + # Never fail the test step for the doctest job. We will check the results in the next step, and fail that + # step instead if the actual test failures are found. This is to avoid the timeout being reported as test + # failure. 
+ test_command = f"({test_command}) || true" + else: + test_command += " || true" steps.append({"run": {"name": "Run tests", "command": test_command}}) + + # Deal with errors + check_test_command = f'if [ -s reports/{self.job_name}/errors.txt ]; ' + check_test_command += 'then echo "Some tests errored out!"; echo ""; ' + check_test_command += f'cat reports/{self.job_name}/errors.txt; ' + check_test_command += 'echo ""; echo ""; ' + + py_command = f'import os; fp = open("reports/{self.job_name}/summary_short.txt"); failed = os.linesep.join([x for x in fp.read().split(os.linesep) if x.startswith("ERROR ")]); fp.close(); fp = open("summary_short.txt", "w"); fp.write(failed); fp.close()' + check_test_command += f"$(python3 -c '{py_command}'); " + check_test_command += f'cat summary_short.txt; echo ""; exit -1; ' + + # Deeal with failed tests + check_test_command += f'elif [ -s reports/{self.job_name}/failures_short.txt ]; ' + check_test_command += 'then echo "Some tests failed!"; echo ""; ' + check_test_command += f'cat reports/{self.job_name}/failures_short.txt; ' + check_test_command += 'echo ""; echo ""; ' + + py_command = f'import os; fp = open("reports/{self.job_name}/summary_short.txt"); failed = os.linesep.join([x for x in fp.read().split(os.linesep) if x.startswith("FAILED ")]); fp.close(); fp = open("summary_short.txt", "w"); fp.write(failed); fp.close()' + check_test_command += f"$(python3 -c '{py_command}'); " + check_test_command += f'cat summary_short.txt; echo ""; exit -1; ' + + check_test_command += f'elif [ -s reports/{self.job_name}/stats.txt ]; then echo "All tests pass!"; ' + + # return code `124` means the previous (pytest run) step is timeout + if self.name == "pr_documentation_tests": + check_test_command += 'elif [ -f 124.txt ]; then echo "doctest timeout!"; ' + + check_test_command += 'else echo "other fatal error"; echo ""; exit -1; fi;' + + steps.append({"run": {"name": "Check test results", "command": check_test_command}}) + + steps.append({"store_test_results": {"path": "test-results"}}) + steps.append({"store_artifacts": {"path": "~/transformers/tests_output.txt"}}) steps.append({"store_artifacts": {"path": "~/transformers/reports"}}) job["steps"] = steps @@ -122,12 +276,12 @@ def job_name(self): "torch_and_tf", additional_env={"RUN_PT_TF_CROSS_TESTS": True}, install_steps=[ - "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng git-lfs", + "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng git-lfs cmake", "git lfs install", - "pip install --upgrade pip", - "pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]", - "pip install tensorflow_probability", - "pip install git+https://github.com/huggingface/accelerate", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]", + "pip install -U --upgrade-strategy eager tensorflow_probability", + "pip install -U --upgrade-strategy eager git+https://github.com/huggingface/accelerate", ], marker="is_pt_tf_cross_test", pytest_options={"rA": None, "durations": 0}, @@ -139,9 +293,9 @@ def job_name(self): additional_env={"RUN_PT_FLAX_CROSS_TESTS": True}, install_steps=[ "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng", - "pip install --upgrade pip", - "pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]", - "pip install git+https://github.com/huggingface/accelerate", + "pip install -U 
--upgrade-strategy eager --upgrade pip", + "pip install -U --upgrade-strategy eager .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]", + "pip install -U --upgrade-strategy eager git+https://github.com/huggingface/accelerate", ], marker="is_pt_flax_cross_test", pytest_options={"rA": None, "durations": 0}, @@ -152,23 +306,24 @@ def job_name(self): "torch", install_steps=[ "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng time", - "pip install --upgrade pip", - "pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]", - "pip install git+https://github.com/huggingface/accelerate", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]", + "pip install -U --upgrade-strategy eager git+https://github.com/huggingface/accelerate", ], - pytest_num_workers=3, + parallelism=1, + pytest_num_workers=6, ) tf_job = CircleCIJob( "tf", install_steps=[ - "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng", - "pip install --upgrade pip", - "pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]", - "pip install tensorflow_probability", + "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng cmake", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]", + "pip install -U --upgrade-strategy eager tensorflow_probability", ], - pytest_options={"rA": None}, + parallelism=1, ) @@ -176,34 +331,36 @@ def job_name(self): "flax", install_steps=[ "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng", - "pip install --upgrade pip", - "pip install .[flax,testing,sentencepiece,flax-speech,vision]", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[flax,testing,sentencepiece,flax-speech,vision]", ], - pytest_options={"rA": None}, + parallelism=1, ) pipelines_torch_job = CircleCIJob( "pipelines_torch", + additional_env={"RUN_PIPELINE_TESTS": True}, install_steps=[ "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng", - "pip install --upgrade pip", - "pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm,video]", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm,video]", ], - pytest_options={"rA": None}, - tests_to_run="tests/pipelines/" + marker="is_pipeline_test", + pytest_num_workers=6, ) pipelines_tf_job = CircleCIJob( "pipelines_tf", + additional_env={"RUN_PIPELINE_TESTS": True}, install_steps=[ - "pip install --upgrade pip", - "pip install .[sklearn,tf-cpu,testing,sentencepiece,vision]", - "pip install tensorflow_probability", + "sudo apt-get -y update && sudo apt-get install -y cmake", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[sklearn,tf-cpu,testing,sentencepiece,vision]", + "pip install -U --upgrade-strategy eager tensorflow_probability", ], - pytest_options={"rA": None}, - tests_to_run="tests/pipelines/" + marker="is_pipeline_test", ) @@ -222,8 +379,8 @@ def job_name(self): "sudo cmake .. 
-DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local\n" "sudo make install\n", }, - "pip install --upgrade pip", - "pip install .[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]", "python -m unidic download", ], parallelism=None, @@ -241,11 +398,10 @@ def job_name(self): cache_name="torch_examples", install_steps=[ "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng", - "pip install --upgrade pip", - "pip install .[sklearn,torch,sentencepiece,testing,torch-speech]", - "pip install -r examples/pytorch/_tests_requirements.txt", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[sklearn,torch,sentencepiece,testing,torch-speech]", + "pip install -U --upgrade-strategy eager -r examples/pytorch/_tests_requirements.txt", ], - tests_to_run="./examples/pytorch/", ) @@ -253,11 +409,11 @@ def job_name(self): "examples_tensorflow", cache_name="tensorflow_examples", install_steps=[ - "pip install --upgrade pip", - "pip install .[sklearn,tensorflow,sentencepiece,testing]", - "pip install -r examples/tensorflow/_tests_requirements.txt", + "sudo apt-get -y update && sudo apt-get install -y cmake", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[sklearn,tensorflow,sentencepiece,testing]", + "pip install -U --upgrade-strategy eager -r examples/tensorflow/_tests_requirements.txt", ], - tests_to_run="./examples/tensorflow/", ) @@ -265,22 +421,22 @@ def job_name(self): "examples_flax", cache_name="flax_examples", install_steps=[ - "pip install --upgrade pip", - "pip install .[flax,testing,sentencepiece]", - "pip install -r examples/flax/_tests_requirements.txt", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[flax,testing,sentencepiece]", + "pip install -U --upgrade-strategy eager -r examples/flax/_tests_requirements.txt", ], - tests_to_run="./examples/flax/", ) hub_job = CircleCIJob( "hub", + additional_env={"HUGGINGFACE_CO_STAGING": True}, install_steps=[ "sudo apt-get -y update && sudo apt-get install git-lfs", 'git config --global user.email "ci@dummy.com"', 'git config --global user.name "ci"', - "pip install --upgrade pip", - "pip install .[torch,sentencepiece,testing]", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[torch,sentencepiece,testing,vision]", ], marker="is_staging_test", pytest_num_workers=1, @@ -290,8 +446,9 @@ def job_name(self): onnx_job = CircleCIJob( "onnx", install_steps=[ - "pip install --upgrade pip", - "pip install .[torch,tf,testing,sentencepiece,onnxruntime,vision,rjieba]", + "sudo apt-get -y update && sudo apt-get install -y cmake", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[torch,tf,testing,sentencepiece,onnxruntime,vision,rjieba]", ], pytest_options={"k onnx": None}, pytest_num_workers=1, @@ -302,17 +459,23 @@ def job_name(self): "exotic_models", install_steps=[ "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev", - "pip install --upgrade pip", - "pip install .[torch,testing,vision]", - "pip install torchvision", - "pip install 'git+https://github.com/facebookresearch/detectron2.git'", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[torch,testing,vision]", + "pip 
install -U --upgrade-strategy eager torchvision", + "pip install -U --upgrade-strategy eager scipy", + "pip install -U --upgrade-strategy eager 'git+https://github.com/facebookresearch/detectron2.git'", "sudo apt install tesseract-ocr", - "pip install pytesseract", - "pip install natten", + "pip install -U --upgrade-strategy eager pytesseract", + "pip install -U --upgrade-strategy eager natten", + "pip install -U --upgrade-strategy eager python-Levenshtein", + "pip install -U --upgrade-strategy eager opencv-python", + "pip install -U --upgrade-strategy eager nltk", ], tests_to_run=[ "tests/models/*layoutlmv*", "tests/models/*nat", + "tests/models/deta", + "tests/models/nougat", ], pytest_num_workers=1, pytest_options={"durations": 100}, @@ -322,15 +485,57 @@ def job_name(self): repo_utils_job = CircleCIJob( "repo_utils", install_steps=[ - "pip install --upgrade pip", - "pip install .[quality,testing]", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[quality,testing,torch]", ], parallelism=None, pytest_num_workers=1, - resource_class=None, + resource_class="large", tests_to_run="tests/repo_utils", ) + +# We also include a `dummy.py` file in the files to be doc-tested to prevent edge case failure. Otherwise, the pytest +# hangs forever during test collection while showing `collecting 0 items / 21 errors`. (To see this, we have to remove +# the bash output redirection.) +py_command = 'from utils.tests_fetcher import get_doctest_files; to_test = get_doctest_files() + ["dummy.py"]; to_test = " ".join(to_test); print(to_test)' +py_command = f"$(python3 -c '{py_command}')" +command = f'echo "{py_command}" > pr_documentation_tests_temp.txt' +doc_test_job = CircleCIJob( + "pr_documentation_tests", + additional_env={"TRANSFORMERS_VERBOSITY": "error", "DATASETS_VERBOSITY": "error", "SKIP_CUDA_DOCTEST": "1"}, + install_steps=[ + "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng time ffmpeg", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager -e .[dev]", + "pip install -U --upgrade-strategy eager git+https://github.com/huggingface/accelerate", + "pip install --upgrade --upgrade-strategy eager pytest pytest-sugar", + "pip install -U --upgrade-strategy eager natten", + "find -name __pycache__ -delete", + "find . -name \*.pyc -delete", + # Add an empty file to keep the test step running correctly even no file is selected to be tested. 
+ "touch dummy.py", + { + "name": "Get files to test", + "command": command, + }, + { + "name": "Show information in `Get files to test`", + "command": + "cat pr_documentation_tests_temp.txt" + }, + { + "name": "Get the last line in `pr_documentation_tests.txt`", + "command": + "tail -n1 pr_documentation_tests_temp.txt | tee pr_documentation_tests.txt" + }, + ], + tests_to_run="$(cat pr_documentation_tests.txt)", # noqa + pytest_options={"-doctest-modules": None, "doctest-glob": "*.md", "dist": "loadfile", "rvsA": None}, + command_timeout=1200, # test cannot run longer than 1200 seconds + pytest_num_workers=1, +) + REGULAR_TESTS = [ torch_and_tf_job, torch_and_flax_job, @@ -352,10 +557,14 @@ def job_name(self): pipelines_tf_job, ] REPO_UTIL_TESTS = [repo_utils_job] +DOC_TESTS = [doc_test_job] + def create_circleci_config(folder=None): if folder is None: folder = os.getcwd() + # Used in CircleCIJob.to_dict() to expand the test list (for using parallelism) + os.environ["test_preparation_dir"] = folder jobs = [] all_test_file = os.path.join(folder, "test_list.txt") if os.path.exists(all_test_file): @@ -375,25 +584,73 @@ def create_circleci_config(folder=None): if len(test_list) > 0: jobs.extend(REGULAR_TESTS) + extended_tests_to_run = set(test_list.split()) + # Extend the test files for cross test jobs + for job in jobs: + if job.job_name in ["tests_torch_and_tf", "tests_torch_and_flax"]: + for test_path in copy.copy(extended_tests_to_run): + dir_path, fn = os.path.split(test_path) + if fn.startswith("test_modeling_tf_"): + fn = fn.replace("test_modeling_tf_", "test_modeling_") + elif fn.startswith("test_modeling_flax_"): + fn = fn.replace("test_modeling_flax_", "test_modeling_") + else: + if job.job_name == "test_torch_and_tf": + fn = fn.replace("test_modeling_", "test_modeling_tf_") + elif job.job_name == "test_torch_and_flax": + fn = fn.replace("test_modeling_", "test_modeling_flax_") + new_test_file = str(os.path.join(dir_path, fn)) + if os.path.isfile(new_test_file): + if new_test_file not in extended_tests_to_run: + extended_tests_to_run.add(new_test_file) + extended_tests_to_run = sorted(extended_tests_to_run) + for job in jobs: + if job.job_name in ["tests_torch_and_tf", "tests_torch_and_flax"]: + job.tests_to_run = extended_tests_to_run + fn = "filtered_test_list_cross_tests.txt" + f_path = os.path.join(folder, fn) + with open(f_path, "w") as fp: + fp.write(" ".join(extended_tests_to_run)) + example_file = os.path.join(folder, "examples_test_list.txt") if os.path.exists(example_file) and os.path.getsize(example_file) > 0: - jobs.extend(EXAMPLES_TESTS) - + with open(example_file, "r", encoding="utf-8") as f: + example_tests = f.read() + for job in EXAMPLES_TESTS: + framework = job.name.replace("examples_", "").replace("torch", "pytorch") + if example_tests == "all": + job.tests_to_run = [f"examples/{framework}"] + else: + job.tests_to_run = [f for f in example_tests.split(" ") if f.startswith(f"examples/{framework}")] + + if len(job.tests_to_run) > 0: + jobs.append(job) + + doctest_file = os.path.join(folder, "doctest_list.txt") + if os.path.exists(doctest_file): + with open(doctest_file) as f: + doctest_list = f.read() + else: + doctest_list = [] + if len(doctest_list) > 0: + jobs.extend(DOC_TESTS) + repo_util_file = os.path.join(folder, "test_repo_utils.txt") if os.path.exists(repo_util_file) and os.path.getsize(repo_util_file) > 0: jobs.extend(REPO_UTIL_TESTS) - if len(jobs) > 0: - config = {"version": "2.1"} - config["parameters"] = { - # Only used to accept the parameters from 
the trigger - "nightly": {"type": "boolean", "default": False}, - "tests_to_run": {"type": "string", "default": test_list}, - } - config["jobs"] = {j.job_name: j.to_dict() for j in jobs} - config["workflows"] = {"version": 2, "run_tests": {"jobs": [j.job_name for j in jobs]}} - with open(os.path.join(folder, "generated_config.yml"), "w") as f: - f.write(yaml.dump(config, indent=2, width=1000000, sort_keys=False)) + if len(jobs) == 0: + jobs = [EmptyJob()] + config = {"version": "2.1"} + config["parameters"] = { + # Only used to accept the parameters from the trigger + "nightly": {"type": "boolean", "default": False}, + "tests_to_run": {"type": "string", "default": test_list}, + } + config["jobs"] = {j.job_name: j.to_dict() for j in jobs} + config["workflows"] = {"version": 2, "run_tests": {"jobs": [j.job_name for j in jobs]}} + with open(os.path.join(folder, "generated_config.yml"), "w") as f: + f.write(yaml.dump(config, indent=2, width=1000000, sort_keys=False)) if __name__ == "__main__": diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index e4e78eceecea..1ec76462acfd 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -26,8 +26,9 @@ body: Models: - text models: @ArthurZucker and @younesbelkada - - vision models: @amyeroberts and @NielsRogge + - vision models: @amyeroberts - speech models: @sanchit-gandhi + - graph models: @clefourrier Library: @@ -36,14 +37,16 @@ body: - pipelines: @Narsil - tensorflow: @gante and @Rocketknight1 - tokenizers: @ArthurZucker - - trainer: @sgugger + - trainer: @muellerzr and @pacman100 Integrations: - - deepspeed: @stas00 + - deepspeed: HF Trainer/Accelerate: @pacman100 - ray/raytune: @richardliaw, @amogkam + - Big Model Inference: @SunMarc + - quantization (bitsandbytes, autogpt): @SunMarc and @younesbelkada - Documentation: @sgugger, @stevhliu and @MKhalusova + Documentation: @stevhliu and @MKhalusova Model hub: @@ -59,7 +62,7 @@ body: Maintained examples (not research project or legacy): - Flax: @sanchit-gandhi - - PyTorch: @sgugger + - PyTorch: See Models above and tag the person corresponding to the modality of the example. - TensorFlow: @Rocketknight1 Research projects are not maintained and should be taken as is. diff --git a/.github/ISSUE_TEMPLATE/i18n.md b/.github/ISSUE_TEMPLATE/i18n.md index 39d369a25324..52667f930508 100644 --- a/.github/ISSUE_TEMPLATE/i18n.md +++ b/.github/ISSUE_TEMPLATE/i18n.md @@ -23,23 +23,23 @@ Some notes: * Please translate in a gender-neutral way. * Add your translations to the folder called `` inside the [source folder](https://github.com/huggingface/transformers/tree/main/docs/source). * Register your translation in `/_toctree.yml`; please follow the order of the [English version](https://github.com/huggingface/transformers/blob/main/docs/source/en/_toctree.yml). -* Once you're finished, open a pull request and tag this issue by including #issue-number in the description, where issue-number is the number of this issue. Please ping @ArthurZucker, @sgugger for review. +* Once you're finished, open a pull request and tag this issue by including #issue-number in the description, where issue-number is the number of this issue. Please ping @stevhliu and @MKhalusova for review. * 🙋 If you'd like others to help you with the translation, you can also post in the 🤗 [forums](https://discuss.huggingface.co/). 
## Get Started section -- [ ] [index.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/index.mdx) https://github.com/huggingface/transformers/pull/20180 -- [ ] [quicktour.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/quicktour.mdx) (waiting for initial PR to go through) -- [ ] [installation.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/installation.mdx). +- [ ] [index.md](https://github.com/huggingface/transformers/blob/main/docs/source/en/index.md) https://github.com/huggingface/transformers/pull/20180 +- [ ] [quicktour.md](https://github.com/huggingface/transformers/blob/main/docs/source/en/quicktour.md) (waiting for initial PR to go through) +- [ ] [installation.md](https://github.com/huggingface/transformers/blob/main/docs/source/en/installation.md). ## Tutorial section -- [ ] [pipeline_tutorial.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/pipeline_tutorial.mdx) -- [ ] [autoclass_tutorial.mdx](https://github.com/huggingface/transformers/blob/master/docs/source/autoclass_tutorial.mdx) -- [ ] [preprocessing.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/preprocessing.mdx) -- [ ] [training.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/training.mdx) -- [ ] [accelerate.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/accelerate.mdx) -- [ ] [model_sharing.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/model_sharing.mdx) -- [ ] [multilingual.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/multilingual.mdx) +- [ ] [pipeline_tutorial.md](https://github.com/huggingface/transformers/blob/main/docs/source/en/pipeline_tutorial.md) +- [ ] [autoclass_tutorial.md](https://github.com/huggingface/transformers/blob/master/docs/source/autoclass_tutorial.md) +- [ ] [preprocessing.md](https://github.com/huggingface/transformers/blob/main/docs/source/en/preprocessing.md) +- [ ] [training.md](https://github.com/huggingface/transformers/blob/main/docs/source/en/training.md) +- [ ] [accelerate.md](https://github.com/huggingface/transformers/blob/main/docs/source/en/accelerate.md) +- [ ] [model_sharing.md](https://github.com/huggingface/transformers/blob/main/docs/source/en/model_sharing.md) +- [ ] [multilingual.md](https://github.com/huggingface/transformers/blob/main/docs/source/en/multilingual.md) diff --git a/.github/conda/meta.yaml b/.github/conda/meta.yaml index 6f6b680d1e6b..6bf33f842fbf 100644 --- a/.github/conda/meta.yaml +++ b/.github/conda/meta.yaml @@ -16,7 +16,6 @@ requirements: - pip - numpy >=1.17 - dataclasses - - importlib_metadata - huggingface_hub - packaging - filelock @@ -31,7 +30,6 @@ requirements: - python - numpy >=1.17 - dataclasses - - importlib_metadata - huggingface_hub - packaging - filelock diff --git a/.github/workflows/add-model-like.yml b/.github/workflows/add-model-like.yml index 3ea3c89249fe..68133a7e2243 100644 --- a/.github/workflows/add-model-like.yml +++ b/.github/workflows/add-model-like.yml @@ -3,13 +3,13 @@ name: Add model like runner on: push: branches: - - main - pull_request: - paths: - - "src/**" - - "tests/**" - - ".github/**" - types: [opened, synchronize, reopened] + - none # put main here when this is fixed + #pull_request: + # paths: + # - "src/**" + # - "tests/**" + # - ".github/**" + # types: [opened, synchronize, reopened] jobs: run_tests_templates_like: diff --git a/.github/workflows/build-docker-images.yml 
b/.github/workflows/build-docker-images.yml index 03ecf450264d..710ad7afe77a 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -3,7 +3,7 @@ name: Build docker images (scheduled) on: push: branches: - - docker-image* + - build_ci_docker_image* repository_dispatch: workflow_call: inputs: @@ -11,7 +11,7 @@ on: required: true type: string schedule: - - cron: "0 1 * * *" + - cron: "17 0 * * *" concurrency: group: docker-images-builds @@ -22,21 +22,31 @@ jobs: name: "Latest PyTorch + TensorFlow [dev]" runs-on: ubuntu-latest steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Check out code uses: actions/checkout@v3 - name: Login to DockerHub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_PASSWORD }} - name: Build and push - uses: docker/build-push-action@v3 + uses: docker/build-push-action@v5 with: context: ./docker/transformers-all-latest-gpu build-args: | @@ -49,7 +59,7 @@ jobs: # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. # The later case is useful for manual image building for debugging purpose. Use another tag in this case! if: inputs.image_postfix != '-push-ci' - uses: docker/build-push-action@v3 + uses: docker/build-push-action@v5 with: context: ./docker/transformers-all-latest-gpu build-args: | @@ -57,54 +67,35 @@ jobs: push: true tags: huggingface/transformers-all-latest-gpu-push-ci - latest-with-torch-nightly-docker: - name: "Nightly PyTorch + Stable TensorFlow" - # Push CI doesn't need this image - if: inputs.image_postfix != '-push-ci' - runs-on: ubuntu-latest - steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v2 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v3 - with: - context: ./docker/transformers-all-latest-gpu - build-args: | - REF=main - PYTORCH=pre - push: true - tags: huggingface/transformers-all-latest-torch-nightly-gpu - latest-torch-deepspeed-docker: name: "Latest PyTorch + DeepSpeed" runs-on: ubuntu-latest steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Check out code uses: actions/checkout@v3 - name: Login to DockerHub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_PASSWORD }} - name: Build and push - uses: docker/build-push-action@v3 + uses: docker/build-push-action@v5 with: context: ./docker/transformers-pytorch-deepspeed-latest-gpu build-args: | @@ -117,15 +108,25 @@ jobs: name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)" runs-on: 
ubuntu-latest steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Check out code uses: actions/checkout@v3 - name: Login to DockerHub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_PASSWORD }} @@ -135,7 +136,7 @@ jobs: # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. # The later case is useful for manual image building for debugging purpose. Use another tag in this case! if: inputs.image_postfix != '-push-ci' - uses: docker/build-push-action@v3 + uses: docker/build-push-action@v5 with: context: ./docker/transformers-pytorch-deepspeed-latest-gpu build-args: | @@ -143,87 +144,104 @@ jobs: push: true tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci - nightly-torch-deepspeed-docker: - name: "Nightly PyTorch + DeepSpeed" + doc-builder: + name: "Doc builder" # Push CI doesn't need this image if: inputs.image_postfix != '-push-ci' runs-on: ubuntu-latest steps: - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Check out code uses: actions/checkout@v3 - name: Login to DockerHub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_PASSWORD }} - name: Build and push - uses: docker/build-push-action@v3 + uses: docker/build-push-action@v5 with: - context: ./docker/transformers-pytorch-deepspeed-nightly-gpu - build-args: | - REF=main + context: ./docker/transformers-doc-builder push: true - tags: huggingface/transformers-pytorch-deepspeed-nightly-gpu + tags: huggingface/transformers-doc-builder - doc-builder: - name: "Doc builder" + latest-pytorch: + name: "Latest PyTorch [dev]" # Push CI doesn't need this image if: inputs.image_postfix != '-push-ci' runs-on: ubuntu-latest steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Check out code uses: actions/checkout@v3 - name: Login to DockerHub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_PASSWORD }} - name: Build and push - uses: docker/build-push-action@v3 + uses: docker/build-push-action@v5 with: - context: ./docker/transformers-doc-builder + context: ./docker/transformers-pytorch-gpu + build-args: | + REF=main push: true - tags: huggingface/transformers-doc-builder + tags: huggingface/transformers-pytorch-gpu - latest-pytorch: - name: "Latest PyTorch [dev]" - # Push CI doesn't need this image - if: inputs.image_postfix != '-push-ci' - runs-on: ubuntu-latest + latest-pytorch-amd: + name: "Latest PyTorch (AMD) [dev]" + runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - - name: Check 
out code + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Check out code uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v2 + - name: Login to DockerHub + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_PASSWORD }} + - name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-amd-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }} + # Push CI images still need to be re-built daily - - name: Build and push - uses: docker/build-push-action@v3 + name: Build and push (for Push CI) in a daily basis + # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + if: inputs.image_postfix != '-push-ci' + uses: docker/build-push-action@v5 with: - context: ./docker/transformers-pytorch-gpu + context: ./docker/transformers-pytorch-amd-gpu build-args: | REF=main push: true - tags: huggingface/transformers-pytorch-gpu + tags: huggingface/transformers-pytorch-amd-gpu-push-ci latest-tensorflow: name: "Latest TensorFlow [dev]" @@ -233,19 +251,19 @@ jobs: steps: - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Check out code uses: actions/checkout@v3 - name: Login to DockerHub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_PASSWORD }} - name: Build and push - uses: docker/build-push-action@v3 + uses: docker/build-push-action@v5 with: context: ./docker/transformers-tensorflow-gpu build-args: | diff --git a/.github/workflows/build-nightly-ci-docker-images.yml b/.github/workflows/build-nightly-ci-docker-images.yml new file mode 100644 index 000000000000..1b8cab864d92 --- /dev/null +++ b/.github/workflows/build-nightly-ci-docker-images.yml @@ -0,0 +1,85 @@ +name: Build docker images (Nightly CI) + +on: + workflow_call: + push: + branches: + - build_nightly_ci_docker_image* + +concurrency: + group: docker-images-builds + cancel-in-progress: false + +jobs: + latest-with-torch-nightly-docker: + name: "Nightly PyTorch + Stable TensorFlow" + runs-on: ubuntu-latest + steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v3 + with: + context: ./docker/transformers-all-latest-gpu + build-args: | + REF=main + PYTORCH=pre + push: true + tags: huggingface/transformers-all-latest-torch-nightly-gpu + + nightly-torch-deepspeed-docker: + name: "Nightly PyTorch + DeepSpeed" + runs-on: ubuntu-latest + steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf 
/usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v3 + with: + context: ./docker/transformers-pytorch-deepspeed-nightly-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-nightly-gpu \ No newline at end of file diff --git a/.github/workflows/build-past-ci-docker-images.yml b/.github/workflows/build-past-ci-docker-images.yml index 3a0e1612454c..aa47dfd08c2d 100644 --- a/.github/workflows/build-past-ci-docker-images.yml +++ b/.github/workflows/build-past-ci-docker-images.yml @@ -3,7 +3,7 @@ name: Build docker images (Past CI) on: push: branches: - - past-ci-docker-image* + - build_past_ci_docker_image* concurrency: group: docker-images-builds @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - version: ["1.11", "1.10", "1.9", "1.8", "1.7", "1.6", "1.5", "1.4"] + version: ["1.13", "1.12", "1.11", "1.10"] runs-on: ubuntu-latest steps: - @@ -24,6 +24,17 @@ jobs: - name: Check out code uses: actions/checkout@v3 + - + id: get-base-image + name: Get Base Image + env: + framework_version: ${{ matrix.version }} + run: | + echo "base_image=$(python3 -c 'import os; from utils.past_ci_versions import past_versions_testing; base_image = past_versions_testing["pytorch"][os.environ["framework_version"]]["base_image"]; print(base_image)')" >> $GITHUB_OUTPUT + - + name: Print Base Image + run: | + echo ${{ steps.get-base-image.outputs.base_image }} - name: Login to DockerHub uses: docker/login-action@v2 @@ -37,6 +48,7 @@ jobs: context: ./docker/transformers-past-gpu build-args: | REF=main + BASE_DOCKER_IMAGE=${{ steps.get-base-image.outputs.base_image }} FRAMEWORK=pytorch VERSION=${{ matrix.version }} push: true @@ -47,7 +59,7 @@ jobs: strategy: fail-fast: false matrix: - version: ["2.8", "2.7", "2.6", "2.5"] + version: ["2.11", "2.10", "2.9", "2.8", "2.7", "2.6", "2.5"] runs-on: ubuntu-latest steps: - @@ -57,37 +69,16 @@ jobs: name: Check out code uses: actions/checkout@v3 - - name: Login to DockerHub - uses: docker/login-action@v2 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} + id: get-base-image + name: Get Base Image + env: + framework_version: ${{ matrix.version }} + run: | + echo "base_image=$(python3 -c 'import os; from utils.past_ci_versions import past_versions_testing; base_image = past_versions_testing["tensorflow"][os.environ["framework_version"]]["base_image"]; print(base_image)')" >> $GITHUB_OUTPUT - - name: Build and push - uses: docker/build-push-action@v3 - with: - context: ./docker/transformers-past-gpu - build-args: | - REF=main - FRAMEWORK=tensorflow - VERSION=${{ matrix.version }} - push: true - tags: huggingface/transformers-tensorflow-past-${{ matrix.version }}-gpu - - past-tensorflow-docker-2-4: - name: "Past TensorFlow Docker" - strategy: - fail-fast: false - matrix: - version: ["2.4"] - runs-on: ubuntu-latest - steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - - name: Check out code - uses: actions/checkout@v3 + name: Print Base Image + run: | + echo ${{ steps.get-base-image.outputs.base_image }} - name: Login to DockerHub uses: docker/login-action@v2 @@ -101,8 +92,8 @@ jobs: 
context: ./docker/transformers-past-gpu build-args: | REF=main - BASE_DOCKER_IMAGE=nvidia/cuda:11.0.3-cudnn8-devel-ubuntu20.04 + BASE_DOCKER_IMAGE=${{ steps.get-base-image.outputs.base_image }} FRAMEWORK=tensorflow VERSION=${{ matrix.version }} push: true - tags: huggingface/transformers-tensorflow-past-${{ matrix.version }}-gpu \ No newline at end of file + tags: huggingface/transformers-tensorflow-past-${{ matrix.version }}-gpu diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index 9f29a7d7a7ef..6eecff24eb17 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -15,6 +15,7 @@ jobs: commit_sha: ${{ github.sha }} package: transformers notebook_folder: transformers_doc - languages: de en es it ko pt zh + languages: de en es fr it ko pt zh secrets: token: ${{ secrets.HUGGINGFACE_PUSH }} + hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 0c8aa237f36e..640a0cb2f59f 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -14,4 +14,4 @@ jobs: commit_sha: ${{ github.event.pull_request.head.sha }} pr_number: ${{ github.event.number }} package: transformers - languages: de en es it ko pt zh + languages: de en es fr it ko pt zh diff --git a/.github/workflows/check_runner_status.yml b/.github/workflows/check_runner_status.yml index 8912e32c94ee..7d0e3853b5df 100644 --- a/.github/workflows/check_runner_status.yml +++ b/.github/workflows/check_runner_status.yml @@ -57,6 +57,7 @@ jobs: CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} + ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} CI_EVENT: runner status check RUNNER_STATUS: ${{ needs.check_runner_status.result }} OFFLINE_RUNNERS: ${{ needs.check_runner_status.outputs.offline_runners }} diff --git a/.github/workflows/check_tiny_models.yml b/.github/workflows/check_tiny_models.yml new file mode 100644 index 000000000000..5a4cb9622f06 --- /dev/null +++ b/.github/workflows/check_tiny_models.yml @@ -0,0 +1,82 @@ +name: Check Tiny Models + +on: + push: + branches: + - check_tiny_models* + repository_dispatch: + schedule: + - cron: "0 2 * * *" + +env: + TOKEN: ${{ secrets.TRANSFORMERS_HUB_BOT_HF_TOKEN }} + +jobs: + check_tiny_models: + name: Check tiny models + runs-on: ubuntu-latest + steps: + - name: Checkout transformers + uses: actions/checkout@v3 + with: + fetch-depth: 2 + + - uses: actions/checkout@v3 + - name: Set up Python 3.8 + uses: actions/setup-python@v4 + with: + # Semantic version range syntax or exact version of a Python version + python-version: '3.8' + # Optional - x64 or x86 architecture, defaults to x64 + architecture: 'x64' + + - name: Install + run: | + sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng cmake + pip install --upgrade pip + python -m pip install -U .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm,video,tf-cpu] + pip install tensorflow_probability + python -m pip install -U natten + + - name: Create all tiny models (locally) + run: | + python utils/create_dummy_models.py tiny_local_models --all --num_workers 2 + + - name: Local tiny model reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: 
tiny_local_model_creation_reports + path: tiny_local_models/reports + + # GitHub-hosted runners have 2-core CPUs + - name: Run pipeline tests against all new (local) tiny models + run: | + OMP_NUM_THREADS=1 TRANSFORMERS_TINY_MODEL_PATH=tiny_local_models python -m pytest --max-worker-restart=0 -n 2 --dist=loadfile -s -rA --make-reports=tests_pipelines tests/models -m is_pipeline_test -k "test_pipeline_" | tee tests_output.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: tiny_local_model_creation_reports + path: reports/tests_pipelines + + - name: Create + Upload tiny models for new model architecture(s) + run: | + python utils/update_tiny_models.py --num_workers 2 + + - name: Full report + run: cat tiny_models/reports/tiny_model_creation_report.json + + - name: Failure report + run: cat tiny_models/reports/simple_failed_report.txt + + - name: Summary report + run: cat tiny_models/reports/tiny_model_summary.json + + - name: New tiny model creation reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: tiny_model_creation_reports + path: tiny_models/reports diff --git a/.github/workflows/delete_doc_comment.yml b/.github/workflows/delete_doc_comment.yml index d894f991ca9c..8604019d76eb 100644 --- a/.github/workflows/delete_doc_comment.yml +++ b/.github/workflows/delete_doc_comment.yml @@ -1,13 +1,14 @@ -name: Delete dev documentation +name: Delete doc comment on: - pull_request: - types: [ closed ] + workflow_run: + workflows: ["Delete doc comment trigger"] + types: + - completed jobs: delete: uses: huggingface/doc-builder/.github/workflows/delete_doc_comment.yml@main - with: - pr_number: ${{ github.event.number }} - package: transformers + secrets: + comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/delete_doc_comment_trigger.yml b/.github/workflows/delete_doc_comment_trigger.yml new file mode 100644 index 000000000000..f87d9bd4dca7 --- /dev/null +++ b/.github/workflows/delete_doc_comment_trigger.yml @@ -0,0 +1,12 @@ +name: Delete doc comment trigger + +on: + pull_request: + types: [ closed ] + + +jobs: + delete: + uses: huggingface/doc-builder/.github/workflows/delete_doc_comment_trigger.yml@main + with: + pr_number: ${{ github.event.number }} diff --git a/.github/workflows/doctests.yml b/.github/workflows/doctests.yml index d65698e2a4f3..236ccbcd253a 100644 --- a/.github/workflows/doctests.yml +++ b/.github/workflows/doctests.yml @@ -6,7 +6,7 @@ on: - doctest* repository_dispatch: schedule: - - cron: "0 2 * * *" + - cron: "17 2 * * *" env: @@ -20,31 +20,36 @@ env: jobs: run_doctests: - runs-on: [self-hosted, doc-tests-gpu] + runs-on: [single-gpu, nvidia-gpu, t4, doctest-ci] container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: + - name: uninstall transformers (installed during docker image build) + run: python3 -m pip uninstall -y transformers + - uses: actions/checkout@v3 - name: NVIDIA-SMI run: | nvidia-smi + - name: Install transformers in edit mode + run: python3 -m pip install -e .[flax] + - name: GPU visibility run: | python3 utils/print_env.py - - name: Prepare files for doctests - run: | - python3 utils/prepare_for_doc_test.py src docs + - name: Show installed libraries and their versions + run: pip freeze - - name: Run doctests + - name: Get doctest files run: | - python3 -m pytest -v --make-reports doc_tests_gpu 
--doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.mdx" + $(python3 -c 'from utils.tests_fetcher import get_all_doctest_files; to_test = get_all_doctest_files(); to_test = " ".join(to_test); fp = open("doc_tests.txt", "w"); fp.write(to_test); fp.close()') - - name: Clean files after doctests + - name: Run doctests run: | - python3 utils/prepare_for_doc_test.py src docs --remove_new_line + python3 -m pytest -v --make-reports doc_tests_gpu --doctest-modules $(cat doc_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.md" - name: Failure short reports if: ${{ failure() }} diff --git a/.github/workflows/self-nightly-past-ci-caller.yml b/.github/workflows/self-nightly-past-ci-caller.yml new file mode 100644 index 000000000000..dfc258e5be85 --- /dev/null +++ b/.github/workflows/self-nightly-past-ci-caller.yml @@ -0,0 +1,145 @@ +name: Self-hosted runner (nightly-past-ci-caller) + +on: + schedule: + # 2:17 am on each Sunday and Thursday + + - cron: "17 2 * * 0,4" + push: + branches: + - run_nightly_ci* + - run_past_ci* + +jobs: + build_nightly_ci_images: + name: Build Nightly CI Docker Images + if: (github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_nightly_ci')) + uses: ./.github/workflows/build-nightly-ci-docker-images.yml + secrets: inherit + + run_nightly_ci: + name: Nightly CI + needs: [build_nightly_ci_images] + uses: ./.github/workflows/self-nightly-scheduled.yml + secrets: inherit + + run_past_ci_pytorch_1-13: + name: PyTorch 1.13 + if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) + needs: [run_nightly_ci] + uses: ./.github/workflows/self-past.yml + with: + framework: pytorch + version: "1.13" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_pytorch_1-12: + name: PyTorch 1.12 + if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) + needs: [run_past_ci_pytorch_1-13] + uses: ./.github/workflows/self-past.yml + with: + framework: pytorch + version: "1.12" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_pytorch_1-11: + name: PyTorch 1.11 + if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) + needs: [run_past_ci_pytorch_1-12] + uses: ./.github/workflows/self-past.yml + with: + framework: pytorch + version: "1.11" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_pytorch_1-10: + name: PyTorch 1.10 + if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) + needs: [run_past_ci_pytorch_1-11] + uses: ./.github/workflows/self-past.yml + with: + framework: pytorch + version: "1.10" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_tensorflow_2-11: + name: TensorFlow 2.11 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_pytorch_1-10] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.11" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_tensorflow_2-10: + name: TensorFlow 2.10 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-11] + uses: 
./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.10" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_tensorflow_2-9: + name: TensorFlow 2.9 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-10] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.9" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_tensorflow_2-8: + name: TensorFlow 2.8 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-9] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.8" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_tensorflow_2-7: + name: TensorFlow 2.7 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-8] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.7" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_tensorflow_2-6: + name: TensorFlow 2.6 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-7] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.6" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_tensorflow_2-5: + name: TensorFlow 2.5 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-6] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.5" + sha: ${{ github.sha }} + secrets: inherit diff --git a/.github/workflows/self-nightly-scheduled.yml b/.github/workflows/self-nightly-scheduled.yml index accccf6164bc..713e004d8e58 100644 --- a/.github/workflows/self-nightly-scheduled.yml +++ b/.github/workflows/self-nightly-scheduled.yml @@ -1,4 +1,4 @@ -name: Self-hosted runner (nightly) +name: Self-hosted runner (nightly-ci) # Note that each job's dependencies go into a corresponding docker file. # @@ -8,9 +8,7 @@ name: Self-hosted runner (nightly) on: repository_dispatch: -# Disable temporarily until the test suite can be run under 12 hours. 
-# schedule: -# - cron: "0 16 * * *" + workflow_call: env: HF_HOME: /mnt/cache @@ -33,7 +31,7 @@ jobs: fetch-depth: 2 - name: Check Runner Status - run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} + run: python utils/check_self_hosted_runner.py --target_runners single-gpu-past-ci-runner-docker,multi-gpu-past-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} check_runners: name: Check Runners @@ -41,7 +39,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] container: image: huggingface/transformers-all-latest-torch-nightly-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -56,7 +54,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] container: image: huggingface/transformers-all-latest-torch-nightly-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -96,7 +94,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} machine_type: [single-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] container: image: huggingface/transformers-all-latest-torch-nightly-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -117,6 +115,10 @@ jobs: working-directory: /transformers run: git fetch && git checkout ${{ github.sha }} + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: NVIDIA-SMI run: | nvidia-smi @@ -143,7 +145,7 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} run_tests_multi_gpu: @@ -153,7 +155,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} machine_type: [multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] container: image: huggingface/transformers-all-latest-torch-nightly-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -174,6 +176,10 @@ jobs: working-directory: /transformers run: git fetch && git checkout ${{ github.sha }} + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . 
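Two patterns recur throughout the self-hosted workflow changes in this diff: runners are now selected by a list of labels (the machine type plus `nvidia-gpu`, `t4` and a CI-flavor label) instead of a runner name assembled with `format()`, and each test job reinstalls `transformers` in editable mode so the checked-out commit overrides the copy baked into the Docker image. A minimal job skeleton, with a hypothetical job name and one of the images used above, isolates both patterns; it is a sketch, not a workflow file from this PR:

```yaml
# Minimal sketch only -- not a file from this PR. It isolates the two recurring
# changes: label-based runner selection and the editable reinstall of
# transformers on top of the prebuilt Docker image.
jobs:
  example_model_tests:            # hypothetical job name
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
    # previously: runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci]
    container:
      image: huggingface/transformers-all-latest-torch-nightly-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host
    steps:
      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
```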
+ - name: NVIDIA-SMI run: | nvidia-smi @@ -200,7 +206,7 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} run_all_tests_torch_cuda_extensions_gpu: @@ -209,7 +215,7 @@ jobs: fail-fast: false matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] needs: setup container: image: huggingface/transformers-pytorch-deepspeed-nightly-gpu @@ -219,6 +225,10 @@ jobs: working-directory: /workspace/transformers run: git fetch && git checkout ${{ github.sha }} + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /workspace/transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: Remove cached torch extensions run: rm -rf /github/home/.cache/torch_extensions/ @@ -229,7 +239,7 @@ jobs: python3 -m pip uninstall -y deepspeed rm -rf DeepSpeed git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | @@ -258,7 +268,7 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports + name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports_postfix_nightly path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu send_results: @@ -291,7 +301,8 @@ jobs: CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }} - CI_EVENT: nightly-build + ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} + CI_EVENT: Nightly CI RUNNER_STATUS: ${{ needs.check_runner_status.result }} RUNNER_ENV_STATUS: ${{ needs.check_runners.result }} SETUP_STATUS: ${{ needs.setup.result }} @@ -301,3 +312,11 @@ jobs: pip install slack_sdk pip show slack_sdk python utils/notification_service.py "${{ needs.setup.outputs.matrix }}" + + + # delete-artifact + - uses: geekyeggo/delete-artifact@v2 + with: + name: | + single-* + multi-* \ No newline at end of file diff --git a/.github/workflows/self-past-caller.yml b/.github/workflows/self-past-caller.yml deleted file mode 100644 index 2cc81dac8ca2..000000000000 --- a/.github/workflows/self-past-caller.yml +++ /dev/null @@ -1,136 +0,0 @@ -name: Self-hosted runner (past-ci-caller) - -on: - push: - branches: - - run-past-ci* - -jobs: - run_past_ci_pytorch_1-11: - name: PyTorch 1.11 - if: always() - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.11" - secrets: inherit - - run_past_ci_pytorch_1-10: - name: PyTorch 1.10 - if: always() - needs: [run_past_ci_pytorch_1-11] - uses: ./.github/workflows/self-past.yml - 
with: - framework: pytorch - version: "1.10" - secrets: inherit - - run_past_ci_pytorch_1-9: - name: PyTorch 1.9 - if: always() - needs: [run_past_ci_pytorch_1-10] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.9" - secrets: inherit - - run_past_ci_pytorch_1-8: - name: PyTorch 1.8 - if: always() - needs: [run_past_ci_pytorch_1-9] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.8" - secrets: inherit - - run_past_ci_pytorch_1-7: - name: PyTorch 1.7 - if: always() - needs: [run_past_ci_pytorch_1-8] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.7" - secrets: inherit - - run_past_ci_pytorch_1-6: - name: PyTorch 1.6 - if: always() - needs: [run_past_ci_pytorch_1-7] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.6" - secrets: inherit - - run_past_ci_pytorch_1-5: - name: PyTorch 1.5 - if: always() - needs: [run_past_ci_pytorch_1-6] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.5" - secrets: inherit - - run_past_ci_pytorch_1-4: - name: PyTorch 1.4 - if: always() - needs: [run_past_ci_pytorch_1-5] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.4" - secrets: inherit - - run_past_ci_tensorflow_2-8: - name: TensorFlow 2.8 - if: always() - needs: [run_past_ci_pytorch_1-4] - uses: ./.github/workflows/self-past.yml - with: - framework: tensorflow - version: "2.8" - secrets: inherit - - run_past_ci_tensorflow_2-7: - name: TensorFlow 2.7 - if: always() - needs: [run_past_ci_tensorflow_2-8] - uses: ./.github/workflows/self-past.yml - with: - framework: tensorflow - version: "2.7" - secrets: inherit - - run_past_ci_tensorflow_2-6: - name: TensorFlow 2.6 - if: always() - needs: [run_past_ci_tensorflow_2-7] - uses: ./.github/workflows/self-past.yml - with: - framework: tensorflow - version: "2.6" - secrets: inherit - - run_past_ci_tensorflow_2-5: - name: TensorFlow 2.5 - if: always() - needs: [run_past_ci_tensorflow_2-6] - uses: ./.github/workflows/self-past.yml - with: - framework: tensorflow - version: "2.5" - secrets: inherit - - run_past_ci_tensorflow_2-4: - name: TensorFlow 2.4 - if: always() - needs: [run_past_ci_tensorflow_2-5] - uses: ./.github/workflows/self-past.yml - with: - framework: tensorflow - version: "2.4" - secrets: inherit \ No newline at end of file diff --git a/.github/workflows/self-past.yml b/.github/workflows/self-past.yml index c59800445bdc..71f904c831e9 100644 --- a/.github/workflows/self-past.yml +++ b/.github/workflows/self-past.yml @@ -1,4 +1,4 @@ -name: Self-hosted runner (past) +name: Self-hosted runner (past-ci) # Note that each job's dependencies go into a corresponding docker file. 
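Both the deleted `self-past-caller.yml` above and the new `self-nightly-past-ci-caller.yml` that replaces it serialize the framework/version runs the same way: each job reuses `self-past.yml` and `needs:` the previous job, and the new caller's `if: (cancelled() != true)` guard replaces the implicit `success()` check, so a failed version does not break the chain (only a cancellation does). A condensed two-link sketch of that pattern, with the event-name and branch-prefix conditions of the real caller omitted:

```yaml
# Condensed sketch of the caller chain (two links only); the real caller also
# checks the event name and branch prefix in each `if:` condition.
jobs:
  run_past_ci_pytorch_1-13:
    name: PyTorch 1.13
    if: (cancelled() != true)
    uses: ./.github/workflows/self-past.yml
    with:
      framework: pytorch
      version: "1.13"
      sha: ${{ github.sha }}
    secrets: inherit

  run_past_ci_pytorch_1-12:
    name: PyTorch 1.12
    if: (cancelled() != true)
    needs: [run_past_ci_pytorch_1-13]  # serializes the runs without aborting the chain on failure
    uses: ./.github/workflows/self-past.yml
    with:
      framework: pytorch
      version: "1.12"
      sha: ${{ github.sha }}
    secrets: inherit
```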
# @@ -50,7 +50,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] container: image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -65,7 +65,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] container: image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -101,7 +101,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} machine_type: [single-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] container: image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -111,6 +111,10 @@ jobs: working-directory: /transformers run: git fetch && git checkout ${{ inputs.sha }} + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: Echo folder ${{ matrix.folders }} shell: bash # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to @@ -126,6 +130,12 @@ jobs: run: | nvidia-smi + - name: Install + if: inputs.framework == 'pytorch' + working-directory: /transformers + run: | + python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate + - name: Environment working-directory: /transformers run: | @@ -157,7 +167,7 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }} path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} run_tests_multi_gpu: @@ -167,7 +177,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} machine_type: [multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] container: image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -177,6 +187,10 @@ jobs: working-directory: /transformers run: git fetch && git checkout ${{ inputs.sha }} + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: Echo folder ${{ matrix.folders }} shell: bash # For folders like `models/bert`, set an env. var. 
(`matrix_folders`) to `models_bert`, which will be used to @@ -192,6 +206,12 @@ jobs: run: | nvidia-smi + - name: Install + if: inputs.framework == 'pytorch' + working-directory: /transformers + run: | + python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate + - name: Environment working-directory: /transformers run: | @@ -223,14 +243,89 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }} path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + run_all_tests_torch_cuda_extensions_gpu: + name: Torch CUDA extension tests + if: inputs.framework == 'pytorch' + strategy: + fail-fast: false + matrix: + machine_type: [single-gpu, multi-gpu] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] + needs: setup + container: + image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: Install + working-directory: /transformers + run: | + python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate + + - name: Remove cached torch extensions + run: rm -rf /github/home/.cache/torch_extensions/ + + # To avoid unknown test failures + - name: Pre build DeepSpeed *again* + working-directory: / + run: | + python3 -m pip uninstall -y deepspeed + rm -rf DeepSpeed + git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . 
--global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all tests on GPU + working-directory: /transformers + run: | + python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }} + path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu + send_results: name: Send results to webhook runs-on: ubuntu-latest if: always() - needs: [check_runner_status, check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu] + needs: [ + check_runner_status, + check_runners, + setup, + run_tests_single_gpu, + run_tests_multi_gpu, + run_all_tests_torch_cuda_extensions_gpu + ] steps: - name: Preliminary job status shell: bash @@ -254,6 +349,7 @@ jobs: CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }} + ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} CI_EVENT: Past CI - ${{ inputs.framework }}-${{ inputs.version }} RUNNER_STATUS: ${{ needs.check_runner_status.result }} RUNNER_ENV_STATUS: ${{ needs.check_runners.result }} @@ -271,4 +367,11 @@ jobs: uses: actions/upload-artifact@v3 with: name: test_failure_tables_${{ inputs.framework }}-${{ inputs.version }} - path: test_failure_tables \ No newline at end of file + path: test_failure_tables + + # delete-artifact + - uses: geekyeggo/delete-artifact@v2 + with: + name: | + single-* + multi-* \ No newline at end of file diff --git a/.github/workflows/self-push-amd.yml b/.github/workflows/self-push-amd.yml new file mode 100644 index 000000000000..0dfbbca7ba12 --- /dev/null +++ b/.github/workflows/self-push-amd.yml @@ -0,0 +1,337 @@ +name: Self-hosted runner AMD GPU (push) + +on: + workflow_run: + workflows: ["Self-hosted runner (push-caller)"] + branches: ["main"] + types: [completed] + push: + branches: + - ci_* + - ci-* + paths: + - "src/**" + - "tests/**" + - ".github/**" + - "templates/**" + - "utils/**" + repository_dispatch: + +env: + HF_HOME: /mnt/cache + TRANSFORMERS_IS_CI: yes + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + PYTEST_TIMEOUT: 60 + TF_FORCE_GPU_ALLOW_GROWTH: true + RUN_PT_TF_CROSS_TESTS: 1 + +jobs: + check_runner_status: + name: Check Runner Status + runs-on: ubuntu-latest + steps: + - name: Checkout transformers + uses: actions/checkout@v3 + with: + fetch-depth: 2 + + - name: Check Runner Status + run: python utils/check_self_hosted_runner.py --target_runners amd-mi210-single-gpu-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} + + check_runners: + name: Check Runners + needs: check_runner_status + strategy: + matrix: + machine_type: [single-gpu, multi-gpu] + gpu_flavor: [mi210] + runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ 
matrix.machine_type }}', '${{ matrix.gpu_flavor }}'] + container: + image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now + options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: ROCM-SMI + run: | + rocminfo | grep "Agent" -A 14 + - name: Show HIP environment + run: | + echo "HIP: $HIP_VISIBLE_DEVICES" + echo "ROCR: $ROCR_VISIBLE_DEVICES" + + setup_gpu: + name: Setup + needs: check_runners + strategy: + matrix: + machine_type: [single-gpu, multi-gpu] + gpu_flavor: [mi210] + runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}'] + container: + image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now + options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + test_map: ${{ steps.set-matrix.outputs.test_map }} + steps: + # Necessary to get the correct branch name and commit SHA for `workflow_run` event + # We also take into account the `push` event (we might want to test some changes in a branch) + - name: Prepare custom environment variables + shell: bash + # `CI_BRANCH_PUSH`: The branch name from the push event + # `CI_BRANCH_WORKFLOW_RUN`: The name of the branch on which this workflow is triggered by `workflow_run` event + # `CI_BRANCH`: The non-empty branch name from the above two (one and only one of them is empty) + # `CI_SHA_PUSH`: The commit SHA from the push event + # `CI_SHA_WORKFLOW_RUN`: The commit SHA that triggers this workflow by `workflow_run` event + # `CI_SHA`: The non-empty commit SHA from the above two (one and only one of them is empty) + run: | + CI_BRANCH_PUSH=${{ github.event.ref }} + CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} + CI_BRANCH_WORKFLOW_RUN=${{ github.event.workflow_run.head_branch }} + CI_SHA_PUSH=${{ github.event.head_commit.id }} + CI_SHA_WORKFLOW_RUN=${{ github.event.workflow_run.head_sha }} + echo $CI_BRANCH_PUSH + echo $CI_BRANCH_WORKFLOW_RUN + echo $CI_SHA_PUSH + echo $CI_SHA_WORKFLOW_RUN + [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV + [[ ! 
-z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV + + - name: print environment variables + run: | + echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" + echo "env.CI_SHA = ${{ env.CI_SHA }}" + + - name: Update clone using environment variables + working-directory: /transformers + run: | + echo "original branch = $(git branch --show-current)" + git fetch && git checkout ${{ env.CI_BRANCH }} + echo "updated branch = $(git branch --show-current)" + git checkout ${{ env.CI_SHA }} + echo "log = $(git log -n 1)" + + - name: Cleanup + working-directory: /transformers + run: | + rm -rf tests/__pycache__ + rm -rf tests/models/__pycache__ + rm -rf reports + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Fetch the tests to run + working-directory: /transformers + # TODO: add `git-python` in the docker images + run: | + pip install --upgrade git-python + python3 utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt + + - name: Report fetched tests + uses: actions/upload-artifact@v3 + with: + name: test_fetched + path: /transformers/test_preparation.txt + + - id: set-matrix + name: Organize tests into models + working-directory: /transformers + # The `keys` is used as GitHub actions matrix for jobs, i.e. `models/bert`, `tokenization`, `pipeline`, etc. + # The `test_map` is used to get the actual identified test files under each key. + # If no test to run (so no `test_map.json` file), create a dummy map (empty matrix will fail) + run: | + if [ -f test_map.json ]; then + keys=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); d = list(test_map.keys()); print(d)') + test_map=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); print(test_map)') + else + keys=$(python3 -c 'keys = ["dummy"]; print(keys)') + test_map=$(python3 -c 'test_map = {"dummy": []}; print(test_map)') + fi + echo $keys + echo $test_map + echo "matrix=$keys" >> $GITHUB_OUTPUT + echo "test_map=$test_map" >> $GITHUB_OUTPUT + + run_tests_amdgpu: + name: Model tests + needs: setup_gpu + # `dummy` means there is no test to run + if: contains(fromJson(needs.setup_gpu.outputs.matrix), 'dummy') != true + strategy: + fail-fast: false + matrix: + folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }} + machine_type: [single-gpu, multi-gpu] + gpu_flavor: [mi210] + runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}'] + container: + image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now + options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + # Necessary to get the correct branch name and commit SHA for `workflow_run` event + # We also take into account the `push` event (we might want to test some changes in a branch) + - name: Prepare custom environment variables + shell: bash + # For the meaning of these environment variables, see the job `Setup` + run: | + CI_BRANCH_PUSH=${{ github.event.ref }} + CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} + CI_BRANCH_WORKFLOW_RUN=${{ github.event.workflow_run.head_branch }} + CI_SHA_PUSH=${{ github.event.head_commit.id }} + CI_SHA_WORKFLOW_RUN=${{ github.event.workflow_run.head_sha }} + echo $CI_BRANCH_PUSH + echo $CI_BRANCH_WORKFLOW_RUN + echo $CI_SHA_PUSH 
+ echo $CI_SHA_WORKFLOW_RUN + [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV + [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV + + - name: print environment variables + run: | + echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" + echo "env.CI_SHA = ${{ env.CI_SHA }}" + + - name: Update clone using environment variables + working-directory: /transformers + run: | + echo "original branch = $(git branch --show-current)" + git fetch && git checkout ${{ env.CI_BRANCH }} + echo "updated branch = $(git branch --show-current)" + git checkout ${{ env.CI_SHA }} + echo "log = $(git log -n 1)" + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: Echo folder ${{ matrix.folders }} + shell: bash + # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # set the artifact folder names (because the character `/` is not allowed). + run: | + echo "${{ matrix.folders }}" + echo "${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'models/'/'models_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + - name: ROCM-SMI + run: | + rocminfo | grep "Agent" -A 14 + - name: Show HIP environment + run: | + echo "HIP: $HIP_VISIBLE_DEVICES" + echo "ROCR: $ROCR_VISIBLE_DEVICES" + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all non-slow selected tests on GPU + working-directory: /transformers + run: | + python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }} + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + + send_results: + name: Send results to webhook + runs-on: ubuntu-latest + if: always() + needs: [ + check_runner_status, + check_runners, + setup_gpu, + run_tests_amdgpu, +# run_tests_torch_cuda_extensions_single_gpu, +# run_tests_torch_cuda_extensions_multi_gpu + ] + steps: + - name: Preliminary job status + shell: bash + # For the meaning of these environment variables, see the job `Setup` + run: | + echo "Runner availability: ${{ needs.check_runner_status.result }}" + echo "Setup status: ${{ needs.setup_gpu.result }}" + echo "Runner status: ${{ needs.check_runners.result }}" + + # Necessary to get the correct branch name and commit SHA for `workflow_run` event + # We also take into account the `push` event (we might want to test some changes in a branch) + - name: Prepare custom environment variables + shell: bash + # For the meaning of these environment variables, see the job `Setup` + 
run: | + CI_BRANCH_PUSH=${{ github.event.ref }} + CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} + CI_BRANCH_WORKFLOW_RUN=${{ github.event.workflow_run.head_branch }} + CI_SHA_PUSH=${{ github.event.head_commit.id }} + CI_SHA_WORKFLOW_RUN=${{ github.event.workflow_run.head_sha }} + echo $CI_BRANCH_PUSH + echo $CI_BRANCH_WORKFLOW_RUN + echo $CI_SHA_PUSH + echo $CI_SHA_WORKFLOW_RUN + [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV + [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV + + - name: print environment variables + run: | + echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" + echo "env.CI_SHA = ${{ env.CI_SHA }}" + + - uses: actions/checkout@v3 + # To avoid failure when multiple commits are merged into `main` in a short period of time. + # Checking out to an old commit beyond the fetch depth will get an error `fatal: reference is not a tree: ... + # (Only required for `workflow_run` event, where we get the latest HEAD on `main` instead of the event commit) + with: + fetch-depth: 20 + + - name: Update clone using environment variables + run: | + echo "original branch = $(git branch --show-current)" + git fetch && git checkout ${{ env.CI_BRANCH }} + echo "updated branch = $(git branch --show-current)" + git checkout ${{ env.CI_SHA }} + echo "log = $(git log -n 1)" + + - uses: actions/download-artifact@v3 + - name: Send message to Slack + env: + CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} + CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} + CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} + CI_SLACK_CHANNEL_ID_AMD: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }} + CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} + CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }} + ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} + CI_EVENT: push + CI_TITLE_PUSH: ${{ github.event.head_commit.message }} + CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }} + CI_SHA: ${{ env.CI_SHA }} + RUNNER_STATUS: ${{ needs.check_runner_status.result }} + RUNNER_ENV_STATUS: ${{ needs.check_runners.result }} + SETUP_STATUS: ${{ needs.setup_gpu.result }} + + # We pass `needs.setup_gpu.outputs.matrix` as the argument. A processing in `notification_service.py` to change + # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. 
+ run: | + pip install slack_sdk + pip show slack_sdk + python utils/notification_service.py "${{ needs.setup_gpu.outputs.matrix }}" diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml index b6c3a70e3eb8..e4b1b3b4b235 100644 --- a/.github/workflows/self-push.yml +++ b/.github/workflows/self-push.yml @@ -45,7 +45,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci] container: image: huggingface/transformers-all-latest-gpu-push-ci options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -60,7 +60,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci] container: image: huggingface/transformers-all-latest-gpu-push-ci options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -158,7 +158,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} machine_type: [single-gpu] - runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci] container: image: huggingface/transformers-all-latest-gpu-push-ci options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -195,6 +195,10 @@ jobs: git checkout ${{ env.CI_SHA }} echo "log = $(git log -n 1)" + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: Echo folder ${{ matrix.folders }} shell: bash # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to @@ -247,7 +251,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} machine_type: [multi-gpu] - runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci] container: image: huggingface/transformers-all-latest-gpu-push-ci options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -284,6 +288,10 @@ jobs: git checkout ${{ env.CI_SHA }} echo "log = $(git log -n 1)" + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: Echo folder ${{ matrix.folders }} shell: bash # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to @@ -336,7 +344,7 @@ jobs: fail-fast: false matrix: machine_type: [single-gpu] - runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci] container: image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -373,6 +381,10 @@ jobs: git checkout ${{ env.CI_SHA }} echo "log = $(git log -n 1)" + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /workspace/transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . 
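Because the push workflows (the AMD one above and `self-push.yml`) can be triggered either by a direct `push` or by a `workflow_run` event, they cannot rely on `github.ref`/`github.sha` alone. The repeated `Prepare custom environment variables` step picks the branch and commit from whichever event actually fired: exactly one variable of each pair is empty, and the non-empty one is exported for the later `git checkout ${{ env.CI_SHA }}`. Isolated from the diff, the step is essentially:

```yaml
- name: Prepare custom environment variables
  shell: bash
  run: |
    CI_BRANCH_PUSH=${{ github.event.ref }}
    CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}   # strip the refs/heads/ prefix
    CI_BRANCH_WORKFLOW_RUN=${{ github.event.workflow_run.head_branch }}
    CI_SHA_PUSH=${{ github.event.head_commit.id }}
    CI_SHA_WORKFLOW_RUN=${{ github.event.workflow_run.head_sha }}
    # exactly one of each pair is empty -- keep whichever value is set
    [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
    [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
```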
+ - name: Remove cached torch extensions run: rm -rf /github/home/.cache/torch_extensions/ @@ -381,7 +393,7 @@ jobs: working-directory: /workspace run: | python3 -m pip uninstall -y deepspeed - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | @@ -422,7 +434,7 @@ jobs: fail-fast: false matrix: machine_type: [multi-gpu] - runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci] container: image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -459,6 +471,10 @@ jobs: git checkout ${{ env.CI_SHA }} echo "log = $(git log -n 1)" + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /workspace/transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: Remove cached torch extensions run: rm -rf /github/home/.cache/torch_extensions/ @@ -467,7 +483,7 @@ jobs: working-directory: /workspace run: | python3 -m pip uninstall -y deepspeed - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | @@ -568,6 +584,7 @@ jobs: CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} + ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} CI_EVENT: push CI_TITLE_PUSH: ${{ github.event.head_commit.message }} CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }} diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 750f4a956943..2bd6bbade1cb 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -9,7 +9,10 @@ name: Self-hosted runner (scheduled) on: repository_dispatch: schedule: - - cron: "0 2 * * *" + - cron: "17 2 * * *" + push: + branches: + - run_scheduled_ci* env: HF_HOME: /mnt/cache @@ -40,7 +43,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -55,7 +58,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -95,7 +98,7 @@ jobs: matrix: folders: ${{ 
fromJson(needs.setup.outputs.matrix) }} machine_type: [single-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -116,6 +119,10 @@ jobs: working-directory: /transformers run: git fetch && git checkout ${{ github.sha }} + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: NVIDIA-SMI run: | nvidia-smi @@ -152,7 +159,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} machine_type: [multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] container: image: huggingface/transformers-all-latest-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -173,6 +180,10 @@ jobs: working-directory: /transformers run: git fetch && git checkout ${{ github.sha }} + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: NVIDIA-SMI run: | nvidia-smi @@ -208,7 +219,7 @@ jobs: fail-fast: false matrix: machine_type: [single-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -218,6 +229,10 @@ jobs: working-directory: /transformers run: git fetch && git checkout ${{ github.sha }} + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: NVIDIA-SMI run: | nvidia-smi @@ -255,7 +270,7 @@ jobs: fail-fast: false matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] container: image: huggingface/transformers-pytorch-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -265,6 +280,10 @@ jobs: working-directory: /transformers run: git fetch && git checkout ${{ github.sha }} + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . 
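The model-test jobs above fan out over `fromJson(needs.setup.outputs.matrix)`. The only place this diff shows how such a matrix is built is the `Organize tests into models` step of `self-push-amd.yml`: the keys of `test_map.json` become the job matrix, the map itself is used to look up the test files per key, and a `dummy` entry keeps the matrix from being empty. A more readable, hypothetical equivalent of those one-liners (same outputs, not the code in this PR) could look like:

```yaml
- id: set-matrix
  name: Organize tests into models
  working-directory: /transformers
  run: |
    # keys of test_map.json -> job matrix; the map itself -> test files per key;
    # a "dummy" entry avoids an empty matrix when there is nothing to run
    python3 - <<'EOF' >> "$GITHUB_OUTPUT"
    import json, os

    if os.path.isfile("test_map.json"):
        with open("test_map.json") as fp:
            test_map = json.load(fp)
    else:
        test_map = {"dummy": []}

    print(f"matrix={list(test_map.keys())}")
    print(f"test_map={test_map}")
    EOF
```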
+ - name: NVIDIA-SMI run: | nvidia-smi @@ -301,7 +320,7 @@ jobs: fail-fast: false matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] container: image: huggingface/transformers-tensorflow-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -312,6 +331,10 @@ jobs: run: | git fetch && git checkout ${{ github.sha }} + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: NVIDIA-SMI run: | nvidia-smi @@ -348,7 +371,7 @@ jobs: fail-fast: false matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] needs: setup container: image: huggingface/transformers-pytorch-deepspeed-latest-gpu @@ -358,6 +381,10 @@ jobs: working-directory: /workspace/transformers run: git fetch && git checkout ${{ github.sha }} + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /workspace/transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: Remove cached torch extensions run: rm -rf /github/home/.cache/torch_extensions/ @@ -366,7 +393,7 @@ jobs: working-directory: /workspace run: | python3 -m pip uninstall -y deepspeed - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | @@ -482,13 +509,25 @@ jobs: CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} + ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} CI_EVENT: scheduled + CI_SHA: ${{ github.sha }} + CI_WORKFLOW_REF: ${{ github.workflow_ref }} RUNNER_STATUS: ${{ needs.check_runner_status.result }} RUNNER_ENV_STATUS: ${{ needs.check_runners.result }} SETUP_STATUS: ${{ needs.setup.result }} # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. run: | + sudo apt-get install -y curl pip install slack_sdk pip show slack_sdk python utils/notification_service.py "${{ needs.setup.outputs.matrix }}" + + # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack. 
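The per-folder report artifacts referenced throughout these workflows get their names from the `Echo folder` step described in the comments earlier in this diff: `models/bert` becomes `models_bert` because `/` is not allowed in artifact names, and postfixes such as `_postfix_nightly` or `_postfix_${{ inputs.framework }}-${{ inputs.version }}` keep reports from different runs apart. Reduced to their essentials, the pair of steps looks like this (the failure-table upload that follows uses the same artifact mechanism):

```yaml
- name: Echo folder ${{ matrix.folders }}
  shell: bash
  run: |
    # `models/bert` -> `models_bert`: `/` is not allowed in artifact names
    matrix_folders=${{ matrix.folders }}
    matrix_folders=${matrix_folders/'models/'/'models_'}
    echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV

- name: Test suite reports artifacts
  if: ${{ always() }}
  uses: actions/upload-artifact@v3
  with:
    # the nightly/past workflows append a postfix such as _postfix_nightly here
    name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
    path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
```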
+ - name: Failure table artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: test_failure_tables + path: test_failure_tables diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 9412442a7d0a..1211d71a32e2 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -2,7 +2,7 @@ name: Stale Bot on: schedule: - - cron: "0 15 * * *" + - cron: "0 8 * * *" jobs: close_stale_issues: @@ -17,7 +17,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v4 with: - python-version: 3.7 + python-version: 3.8 - name: Install requirements run: | diff --git a/.github/workflows/update_metdata.yml b/.github/workflows/update_metdata.yml index f6c9afd15b7e..de25fe0a08a2 100644 --- a/.github/workflows/update_metdata.yml +++ b/.github/workflows/update_metdata.yml @@ -4,7 +4,7 @@ on: push: branches: - main - - update_transformers_metadata + - update_transformers_metadata* jobs: build_and_package: @@ -16,25 +16,12 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Load cached virtual environment - uses: actions/cache@v2 - id: cache - with: - path: ~/venv/ - key: v3-metadata-${{ hashFiles('setup.py') }} - - - name: Create virtual environment on cache miss - if: steps.cache.outputs.cache-hit != 'true' - run: | - python -m venv ~/venv && . ~/venv/bin/activate - pip install --upgrade pip - - name: Setup environment run: | - . ~/venv/bin/activate - pip install git+https://github.com/huggingface/transformers#egg=transformers[dev] + pip install --upgrade pip + pip install datasets pandas==2.0.3 + pip install .[torch,tf,flax] - name: Update metadata run: | - . ~/venv/bin/activate - python utils/update_metadata.py --token ${{ secrets.SYLVAIN_HF_TOKEN }} --commit_sha ${{ github.sha }} + python utils/update_metadata.py --token ${{ secrets.LYSANDRE_HF_TOKEN }} --commit_sha ${{ github.sha }} diff --git a/.github/workflows/upload_pr_documentation.yml b/.github/workflows/upload_pr_documentation.yml new file mode 100644 index 000000000000..64befc595c42 --- /dev/null +++ b/.github/workflows/upload_pr_documentation.yml @@ -0,0 +1,16 @@ +name: Upload PR Documentation + +on: + workflow_run: + workflows: ["Build PR Documentation"] + types: + - completed + +jobs: + build: + uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main + with: + package_name: transformers + secrets: + hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} + comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} \ No newline at end of file diff --git a/.gitignore b/.gitignore index cf8183463613..337f2ef2c735 100644 --- a/.gitignore +++ b/.gitignore @@ -163,4 +163,7 @@ tags *.lock # DS_Store (MacOS) -.DS_Store \ No newline at end of file +.DS_Store + +# ruff +.ruff_cache diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index aae902fa4a8d..6cfa3e47398c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -130,7 +130,7 @@ You will need basic `git` proficiency to contribute to manual. Type `git --help` in a shell and enjoy! If you prefer books, [Pro Git](https://git-scm.com/book/en/v2) is a very good reference. -You'll need **[Python 3.7]((https://github.com/huggingface/transformers/blob/main/setup.py#L426))** or above to contribute to 🤗 Transformers. Follow the steps below to start contributing: +You'll need **[Python 3.8]((https://github.com/huggingface/transformers/blob/main/setup.py#L426))** or above to contribute to 🤗 Transformers. Follow the steps below to start contributing: 1. 
Fork the [repository](https://github.com/huggingface/transformers) by clicking on the **[Fork](https://github.com/huggingface/transformers/fork)** button on the repository's page. This creates a copy of the code @@ -139,15 +139,15 @@ You'll need **[Python 3.7]((https://github.com/huggingface/transformers/blob/mai 2. Clone your fork to your local disk, and add the base repository as a remote: ```bash - $ git clone git@github.com:/transformers.git - $ cd transformers - $ git remote add upstream https://github.com/huggingface/transformers.git + git clone git@github.com:/transformers.git + cd transformers + git remote add upstream https://github.com/huggingface/transformers.git ``` 3. Create a new branch to hold your development changes: ```bash - $ git checkout -b a-descriptive-name-for-my-changes + git checkout -b a-descriptive-name-for-my-changes ``` 🚨 **Do not** work on the `main` branch! @@ -155,39 +155,41 @@ You'll need **[Python 3.7]((https://github.com/huggingface/transformers/blob/mai 4. Set up a development environment by running the following command in a virtual environment: ```bash - $ pip install -e ".[dev]" + pip install -e ".[dev]" ``` If 🤗 Transformers was already installed in the virtual environment, remove it with `pip uninstall transformers` before reinstalling it in editable mode with the `-e` flag. - Depending on your OS, you may need to install some external libraries as well if the `pip` installation fails. - - For macOS, you will likely need [MeCab](https://taku910.github.io/mecab/) which can be installed from Homebrew: - + Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a + failure with this command. If that's the case make sure to install the Deep Learning framework you are working with + (PyTorch, TensorFlow and/or Flax) then do: + ```bash - brew install mecab + pip install -e ".[quality]" ``` + which should be enough for most use cases. + 5. Develop the features on your branch. As you work on your code, you should make sure the test suite passes. Run the tests impacted by your changes like this: ```bash - $ pytest tests/.py + pytest tests/.py ``` For more information about tests, check out the [Testing](https://huggingface.co/docs/transformers/testing) guide. - 🤗 Transformers relies on `black` and `isort` to format its source code + 🤗 Transformers relies on `black` and `ruff` to format its source code consistently. After you make changes, apply automatic style corrections and code verifications that can't be automated in one go with: ```bash - $ make fixup + make fixup ``` This target is also optimized to only work with files modified by the PR you're working on. @@ -196,21 +198,21 @@ You'll need **[Python 3.7]((https://github.com/huggingface/transformers/blob/mai style corrections: ```bash - $ make style + make style ``` - 🤗 Transformers also uses `flake8` and a few custom scripts to check for coding mistakes. Quality + 🤗 Transformers also uses `ruff` and a few custom scripts to check for coding mistakes. Quality controls are run by the CI, but you can run the same checks with: ```bash - $ make quality + make quality ``` Finally, we have a lot of scripts to make sure we didn't forget to update some files when adding a new model. 
You can run these scripts with: ```bash - $ make repo-consistency + make repo-consistency ``` To learn more about those checks and how to fix any issues with them, check out the @@ -220,13 +222,13 @@ You'll need **[Python 3.7]((https://github.com/huggingface/transformers/blob/mai make sure you install the documentation builder: ```bash - $ pip install ".[docs]" + pip install ".[docs]" ``` Run the following command from the root of the repository: ```bash - $ doc-builder build transformers docs/source/en --build_dir ~/tmp/test-build + doc-builder build transformers docs/source/en --build_dir ~/tmp/test-build ``` This will build the documentation in the `~/tmp/test-build` folder where you can inspect the generated @@ -236,8 +238,8 @@ You'll need **[Python 3.7]((https://github.com/huggingface/transformers/blob/mai record your changes locally with `git commit`: ```bash - $ git add modified_file.py - $ git commit + git add modified_file.py + git commit ``` Please remember to write [good commit @@ -247,14 +249,14 @@ You'll need **[Python 3.7]((https://github.com/huggingface/transformers/blob/mai repository, rebase your branch on `upstream/branch` *before* you open a pull request or if requested by a maintainer: ```bash - $ git fetch upstream - $ git rebase upstream/main + git fetch upstream + git rebase upstream/main ``` Push your changes to your branch: ```bash - $ git push -u origin a-descriptive-name-for-my-changes + git push -u origin a-descriptive-name-for-my-changes ``` If you've already opened a pull request, you'll need to force push with the `--force` flag. Otherwise, if the pull request hasn't been opened yet, you can just push your changes normally. @@ -273,7 +275,7 @@ You'll need **[Python 3.7]((https://github.com/huggingface/transformers/blob/mai request description to make sure they are linked (and people viewing the issue know you are working on it).
☐ To indicate a work in progress please prefix the title with `[WIP]`. These are
-useful to avoid duplicated work, and to differentiate it from PRs ready to be merged.
+useful to avoid duplicated work, and to differentiate it from PRs ready to be merged.
☐ Make sure existing tests pass.
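As a concrete sketch of this check, assuming your changes touched the BERT modeling files (substitute the test paths that match the files you actually modified):

```bash
# Run only the tests that cover the modified files; the path below is just an example.
python -m pytest tests/models/bert/test_modeling_bert.py -v
```

Running `make test` instead exercises the whole library test suite, which is much slower.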
☐ If adding a new feature, also add tests for it.
   - If you are adding a new model, make sure you use
@@ -282,7 +284,7 @@ useful to avoid duplicated work, and to differentiate it from PRs ready to be me
     `RUN_SLOW=1 python -m pytest tests/models/my_new_model/test_my_new_model.py`.
   - If you are adding a new tokenizer, write tests and make sure
     `RUN_SLOW=1 python -m pytest tests/models/{your_model_name}/test_tokenization_{your_model_name}.py` passes.
-  CircleCI does not run the slow tests, but GitHub Actions does every night!
+  - CircleCI does not run the slow tests, but GitHub Actions does every night!
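For a hypothetical new model named `my_new_model` (the placeholder used in the items above), the corresponding slow checks can be run locally like this:

```bash
# Slow tests are skipped by default; RUN_SLOW=1 enables them for the new model tests...
RUN_SLOW=1 python -m pytest tests/models/my_new_model/test_my_new_model.py

# ...and for the new tokenizer tests, if you added one.
RUN_SLOW=1 python -m pytest tests/models/my_new_model/test_tokenization_my_new_model.py
```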
☐ All public methods must have informative docstrings (see [`modeling_bert.py`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py) @@ -307,14 +309,14 @@ We like `pytest` and `pytest-xdist` because it's faster. From the root of the repository, specify a *path to a subfolder or a test file* to run the test. ```bash -$ python -m pytest -n auto --dist=loadfile -s -v ./tests/models/my_new_model +python -m pytest -n auto --dist=loadfile -s -v ./tests/models/my_new_model ``` Similarly, for the `examples` directory, specify a *path to a subfolder or test file* to run the test. For example, the following command tests the text classification subfolder in the PyTorch `examples` directory: ```bash -$ pip install -r examples/xxx/requirements.txt # only needed the first time -$ python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/text-classification +pip install -r examples/xxx/requirements.txt # only needed the first time +python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/text-classification ``` In fact, this is actually how our `make test` and `make test-examples` commands are implemented (not including the `pip install`)! @@ -333,11 +335,16 @@ Remember to specify a *path to a subfolder or a test file* to run the test. Othe ```bash -$ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./tests/models/my_new_model -$ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/text-classification +RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./tests/models/my_new_model +RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/text-classification ``` -Like the slow tests, custom tokenizer tests are skipped but you can set the `RUN_CUSTOM_TOKENIZERS` environment variable to `yes` to run them. +Like the slow tests, there are other environment variables available which not enabled by default during testing: +- `RUN_CUSTOM_TOKENIZERS`: Enables tests for custom tokenizers. +- `RUN_PT_FLAX_CROSS_TESTS`: Enables tests for PyTorch + Flax integration. +- `RUN_PT_TF_CROSS_TESTS`: Enables tests for TensorFlow + PyTorch integration. + +More environment variables and additional information can be found in the [testing_utils.py](src/transformers/testing_utils.py). 🤗 Transformers uses `pytest` as a test runner only. It doesn't use any `pytest`-specific features in the test suite itself. @@ -346,8 +353,8 @@ This means `unittest` is fully supported. Here's how to run tests with `unittest`: ```bash -$ python -m unittest discover -s tests -t . -v -$ python -m unittest discover -s examples -t examples -v +python -m unittest discover -s tests -t . -v +python -m unittest discover -s examples -t examples -v ``` ### Style guide @@ -358,7 +365,7 @@ for more information. ### Develop on Windows -On Windows (unless you're working in [Windows Subsytem for Linux](https://learn.microsoft.com/en-us/windows/wsl/) or WSL), you need to configure git to transform Windows `CRLF` line endings to Linux `LF` line endings: +On Windows (unless you're working in [Windows Subsystem for Linux](https://learn.microsoft.com/en-us/windows/wsl/) or WSL), you need to configure git to transform Windows `CRLF` line endings to Linux `LF` line endings: ```bash git config core.autocrlf input @@ -381,8 +388,8 @@ When updating the main branch of a forked repository, please follow these steps 2. 
If a PR is absolutely necessary, use the following steps after checking out your branch: ```bash -$ git checkout -b your-branch-for-syncing -$ git pull --squash --no-commit upstream main -$ git commit -m '' -$ git push --set-upstream origin your-branch-for-syncing +git checkout -b your-branch-for-syncing +git pull --squash --no-commit upstream main +git commit -m '' +git push --set-upstream origin your-branch-for-syncing ``` diff --git a/ISSUES.md b/ISSUES.md index 7c36da3c6804..95f2334b26c8 100644 --- a/ISSUES.md +++ b/ISSUES.md @@ -158,7 +158,7 @@ You are not required to read the following guidelines before opening an issue. H --do_train --n_train 500 --num_train_epochs 1 \ --per_device_train_batch_size 1 --freeze_embeds \ --src_lang en_XX --tgt_lang ro_RO --task translation \ - --fp16 --sharded_ddp + --fp16 ``` If you don't break it up, one has to scroll horizontally which often makes it quite difficult to quickly see what's happening. diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 1aba38f67a22..000000000000 --- a/MANIFEST.in +++ /dev/null @@ -1 +0,0 @@ -include LICENSE diff --git a/Makefile b/Makefile index 999ddd6ee156..0c51598594c0 100644 --- a/Makefile +++ b/Makefile @@ -9,9 +9,8 @@ modified_only_fixup: $(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs))) @if test -n "$(modified_py_files)"; then \ echo "Checking/fixing $(modified_py_files)"; \ - black --preview $(modified_py_files); \ - isort $(modified_py_files); \ - flake8 $(modified_py_files); \ + black $(modified_py_files); \ + ruff $(modified_py_files) --fix; \ else \ echo "No library .py files were modified"; \ fi @@ -40,17 +39,19 @@ repo-consistency: python utils/check_repo.py python utils/check_inits.py python utils/check_config_docstrings.py - python utils/tests_fetcher.py --sanity_check + python utils/check_config_attributes.py + python utils/check_doctest_list.py python utils/update_metadata.py --check-only + python utils/check_task_guides.py + python utils/check_docstrings.py # this target runs checks on all files quality: - black --check --preview $(check_dirs) - isort --check-only $(check_dirs) + black --check $(check_dirs) setup.py conftest.py python utils/custom_init_isort.py --check_only python utils/sort_auto_mappings.py --check_only - flake8 $(check_dirs) + ruff $(check_dirs) setup.py conftest.py doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source python utils/check_doc_toc.py @@ -65,8 +66,8 @@ extra_style_checks: # this target runs checks on all files and potentially modifies some of them style: - black --preview $(check_dirs) - isort $(check_dirs) + black $(check_dirs) setup.py conftest.py + ruff $(check_dirs) setup.py conftest.py --fix ${MAKE} autogenerate_code ${MAKE} extra_style_checks @@ -80,6 +81,9 @@ fix-copies: python utils/check_copies.py --fix_and_overwrite python utils/check_table.py --fix_and_overwrite python utils/check_dummies.py --fix_and_overwrite + python utils/check_doctest_list.py --fix_and_overwrite + python utils/check_task_guides.py --fix_and_overwrite + python utils/check_docstrings.py --fix_and_overwrite # Run tests for the library @@ -110,3 +114,10 @@ post-release: post-patch: python utils/release.py --post_release --patch + +build-release: + rm -rf dist + rm -rf build + python setup.py bdist_wheel + python setup.py sdist + python utils/check_build.py diff --git a/README.md b/README.md index 0906c65deeda..c8afdd51d334 100644 --- a/README.md +++ b/README.md @@ -15,10 +15,15 @@ 
limitations under the License. -->

[README header: logo banner and badges; the new logo image's alt text is "Hugging Face Transformers Library".]

@@ -46,8 +51,9 @@ limitations under the License.
        한국어 |
        Español |
        日本語 |
-       हिन्दी
+       हिन्दी |
+       Русский

@@ -91,16 +97,35 @@ In Computer Vision: - [Image classification with ViT](https://huggingface.co/google/vit-base-patch16-224) - [Object Detection with DETR](https://huggingface.co/facebook/detr-resnet-50) - [Semantic Segmentation with SegFormer](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512) -- [Panoptic Segmentation with DETR](https://huggingface.co/facebook/detr-resnet-50-panoptic) +- [Panoptic Segmentation with MaskFormer](https://huggingface.co/facebook/maskformer-swin-small-coco) +- [Depth Estimation with DPT](https://huggingface.co/docs/transformers/model_doc/dpt) +- [Video Classification with VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae) +- [Universal Segmentation with OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_dinat_large) In Audio: - [Automatic Speech Recognition with Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base-960h) - [Keyword Spotting with Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks) +- [Audio Classification with Audio Spectrogram Transformer](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593) In Multimodal tasks: +- [Table Question Answering with TAPAS](https://huggingface.co/google/tapas-base-finetuned-wtq) - [Visual Question Answering with ViLT](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa) +- [Zero-shot Image Classification with CLIP](https://huggingface.co/openai/clip-vit-large-patch14) +- [Document Question Answering with LayoutLM](https://huggingface.co/impira/layoutlm-document-qa) +- [Zero-shot Video Classification with X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip) + + +## 100 projects using Transformers + +Transformers is more than a toolkit to use pretrained models: it's a community of projects built around it and the +Hugging Face Hub. We want Transformers to enable developers, researchers, students, professors, engineers, and anyone +else to build their dream projects. + +In order to celebrate the 100,000 stars of transformers, we have decided to put the spotlight on the +community, and we have created the [awesome-transformers](./awesome-transformers.md) page which lists 100 +incredible projects built in the vicinity of transformers. -**[Write With Transformer](https://transformer.huggingface.co)**, built by the Hugging Face team, is the official demo of this repo’s text generation capabilities. +If you own or use a project that you believe should be part of the list, please open a PR to add it! ## If you are looking for custom support from the Hugging Face team @@ -223,7 +248,7 @@ The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/sta ### With pip -This repository is tested on Python 3.6+, Flax 0.3.2+, PyTorch 1.3.1+ and TensorFlow 2.3+. +This repository is tested on Python 3.8+, Flax 0.4.1+, PyTorch 1.10+ and TensorFlow 2.6+. You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). @@ -263,8 +288,11 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each them): 1. 
**[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[AltCLIP](https://huggingface.co/docs/transformers/main/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. +1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig. +1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. +1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long. +1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team. 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. @@ -274,24 +302,31 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. 
**[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. +1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. **[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. 
**[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry. +1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. +1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park. 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. +1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. 
**[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. +1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. +1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/). 1. 
**[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. @@ -300,36 +335,53 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai. 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. +1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun. +1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl. 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. 1. 
**[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi. +1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski. 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. +1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. +1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le. 1. 
**[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. +1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi. 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu. +1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang. 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. +1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. 1. 
**[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. +1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. +1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 1. 
**[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. -1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. +1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://openai.com/research/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. 1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori. -1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. +1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://openai.com/research/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. -1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. +1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). +1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. +1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. +1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. +1. 
**[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang. +1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever. 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. @@ -338,6 +390,8 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze. 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding. +1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. +1. 
**[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushka rMishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing EllenTan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom. 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang. 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. @@ -346,32 +400,50 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei. +1. 
**[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. 1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. +1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos. 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. +1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Meta/USC/CMU/SJTU) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao. +1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. 1. 
**[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka. +1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli. 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou. 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam. 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen. 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari. +1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari. 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. +1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaiML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team. +1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh. 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. +1. 
**[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez. 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. 1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi. 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic. 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. +1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. +1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. 1. 
**[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. +1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani. 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. +1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. +1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. +1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) 
released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. @@ -380,52 +452,65 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. -1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. +1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. +1. 
**[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. +1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng. 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. +1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei. 1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. 1. 
**[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. +1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. -1. **[Swin2SR](https://huggingface.co/docs/transformers/main/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. -1. **[SwitchTransformers](https://huggingface.co/docs/transformers/main/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. +1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. +1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. 
**[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). -1. **[TimeSformer](https://huggingface.co/docs/transformers/main/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. +1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. +1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal. 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler +1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant. 1. 
**[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. +1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun. 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. 
**[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He. 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. +1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. +1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. +1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. 1. 
**[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. 1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever. 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling. +1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe. 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. 1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. +1. 
**[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa. 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. 1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. 1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. @@ -448,7 +533,6 @@ These implementations have been tested on several datasets (see the example scri | [Training and fine-tuning](https://huggingface.co/docs/transformers/training) | Using the models provided by 🤗 Transformers in a PyTorch/TensorFlow training loop and the `Trainer` API | | [Quick tour: Fine-tuning/usage scripts](https://github.com/huggingface/transformers/tree/main/examples) | Example scripts for fine-tuning models on a wide range of tasks | | [Model sharing and uploading](https://huggingface.co/docs/transformers/model_sharing) | Upload and share your fine-tuned models with the community | -| [Migration](https://huggingface.co/docs/transformers/migration) | Migrate to 🤗 Transformers from `pytorch-transformers` or `pytorch-pretrained-bert` | ## Citation diff --git a/README_es.md b/README_es.md index 341fd87923ca..78cfb415f975 100644 --- a/README_es.md +++ b/README_es.md @@ -18,7 +18,7 @@ limitations under the License.

[logo and Build badge markup omitted]
@@ -47,7 +47,7 @@ limitations under the License.
Español | 日本語 | हिन्दी
@@ -92,6 +92,7 @@ En visión de ordenador: - [Detección de objetos con DETR](https://huggingface.co/facebook/detr-resnet-50) - [Segmentación semántica con SegFormer](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512) - [Segmentación panóptica con DETR](https://huggingface.co/facebook/detr-resnet-50-panoptic) +- [Segmentación Universal con OneFormer (Segmentación Semántica, de Instancia y Panóptica con un solo modelo)](https://huggingface.co/shi-labs/oneformer_ade20k_dinat_large) En Audio: - [Reconocimiento de voz automático con Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base-960h) @@ -223,7 +224,7 @@ El modelo en si es un [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.h ### Con pip -Este repositorio está probado en Python 3.6+, Flax 0.3.2+, PyTorch 1.3.1+ y TensorFlow 2.3+. +Este repositorio está probado en Python 3.8+, Flax 0.4.1+, PyTorch 1.10+ y TensorFlow 2.6+. Deberías instalar 🤗 Transformers en un [ambiente virtual](https://docs.python.org/3/library/venv.html). Si no estas familiarizado con los entornos virtuales de Python, consulta la [guía de usuario](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). @@ -263,8 +264,11 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 🤗 Transformers actualmente proporciona las siguientes arquitecturas (ver [aquí](https://huggingface.co/docs/transformers/model_summary) para un resumen de alto nivel de cada uno de ellas.): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[AltCLIP](https://huggingface.co/docs/transformers/main/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. +1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig. +1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. +1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long. +1. 
**[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team. 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. @@ -274,24 +278,31 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. +1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. 
**[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. **[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry. +1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. +1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park. 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. 1. 
**[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. +1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. 
**[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. +1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. +1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/). 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. @@ -300,36 +311,53 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai. 1. 
**[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. +1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun. +1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl. 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. 1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi. +1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski. 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. 1. 
**[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. +1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. +1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. +1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi. 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu. +1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang. 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. 
**ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. +1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. 1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. +1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. 1. 
**[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. +1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. 1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori. 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. -1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. -1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. 
**[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. +1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. +1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto (tanreinama). +1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. +1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. +1. 
**[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. +1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang. +1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever. 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. @@ -338,6 +366,8 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze. 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding. +1. 
**[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. +1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/XXX) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom. 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang. 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. @@ -346,32 +376,50 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. 1. 
**[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei. +1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. 1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. +1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos. 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. +1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Facebook) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao. +1. 
**[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka. +1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli. 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou. 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam. 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen. 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari. +1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari. 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. +1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team. +1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh. 1. 
**[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. +1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez. 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. 1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi. 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic. 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. +1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. +1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. 
**[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. +1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani. 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. +1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. +1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee. 1. 
**[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. +1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. @@ -380,52 +428,65 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. -1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. +1. 
**[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. +1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. +1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng) released in [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng. 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. +1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei. 1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. 1. 
**[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. +1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. -1. **[Swin2SR](https://huggingface.co/docs/transformers/main/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. -1. **[SwitchTransformers](https://huggingface.co/docs/transformers/main/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. +1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. +1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. 
**[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). -1. **[TimeSformer](https://huggingface.co/docs/transformers/main/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. +1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. +1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal. 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. 
Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler +1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant. 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. +1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun. 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -1. 
**[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He. 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. +1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. +1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. +1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. 1. 
**[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. 1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever. 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling. +1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe. 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. 1. 
**[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. +1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa. 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. 1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. 1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. @@ -465,4 +526,4 @@ Ahora nosotros tenemos un [papel](https://www.aclweb.org/anthology/2020.emnlp-de url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6", pages = "38--45" } -``` +``` \ No newline at end of file diff --git a/README_hd.md b/README_hd.md index 194aa1ab7a8b..4cd0052bd295 100644 --- a/README_hd.md +++ b/README_hd.md @@ -43,7 +43,7 @@ checkpoint: जाँच बिंदु
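Most of the architectures in the list above are exposed through the library's `Auto*` classes, so they are all loaded the same way. A minimal sketch follows, assuming `transformers` and PyTorch are installed; the public `gpt2` checkpoint is used here only as a stand-in, and any Hub checkpoint for one of the listed text-generation models could be substituted.

```python
# Minimal sketch: load one of the listed architectures via the Auto classes.
# "gpt2" is just an example checkpoint; swap in any compatible Hub checkpoint.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Tokenize a prompt and generate a short continuation.
inputs = tokenizer("Hello, world", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

Vision, audio, and multimodal entries in the list follow the same pattern with their task-specific Auto classes (for example `AutoImageProcessor` with `AutoModelForImageClassification`).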

@@ -200,7 +200,7 @@ checkpoint: जाँच बिंदु ### पिप का उपयोग करना -इस रिपॉजिटरी का परीक्षण Python 3.6+, Flax 0.3.2+, PyTorch 1.3.1+ और TensorFlow 2.3+ के तहत किया गया है। +इस रिपॉजिटरी का परीक्षण Python 3.8+, Flax 0.4.1+, PyTorch 1.10+ और TensorFlow 2.6+ के तहत किया गया है। आप [वर्चुअल एनवायरनमेंट] (https://docs.python.org/3/library/venv.html) में 🤗 ट्रांसफॉर्मर इंस्टॉल कर सकते हैं। यदि आप अभी तक पायथन के वर्चुअल एनवायरनमेंट से परिचित नहीं हैं, तो कृपया इसे [उपयोगकर्ता निर्देश] (https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) पढ़ें। @@ -236,8 +236,11 @@ conda install -c huggingface transformers 🤗 ट्रांसफॉर्मर वर्तमान में निम्नलिखित आर्किटेक्चर का समर्थन करते हैं (मॉडल के अवलोकन के लिए [यहां] देखें (https://huggingface.co/docs/transformers/model_summary)): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (Google Research and the Toyota Technological Institute at Chicago) साथ थीसिस [ALBERT: A Lite BERT for Self-supervised भाषा प्रतिनिधित्व सीखना](https://arxiv.org/abs/1909.11942), झेंझोंग लैन, मिंगदा चेन, सेबेस्टियन गुडमैन, केविन गिम्पेल, पीयूष शर्मा, राडू सोरिकट -1. **[AltCLIP](https://huggingface.co/docs/transformers/main/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. +1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (Google Research से) Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig. द्वाराअनुसंधान पत्र [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) के साथ जारी किया गया +1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. +1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long. +1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team. 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (फेसबुक) साथ थीसिस [बार्ट: प्राकृतिक भाषा निर्माण, अनुवाद के लिए अनुक्रम-से-अनुक्रम पूर्व प्रशिक्षण , और समझ] (https://arxiv.org/pdf/1910.13461.pdf) पर निर्भर माइक लुईस, यिनहान लियू, नमन गोयल, मार्जन ग़ज़विनिनेजाद, अब्देलरहमान मोहम्मद, ओमर लेवी, वेस स्टोयानोव और ल्यूक ज़ेटलमॉयर 1. 
**[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (से École polytechnique) साथ थीसिस [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) पर निर्भर Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis रिहाई। 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (VinAI Research से) साथ में पेपर [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701)गुयेन लुओंग ट्रान, डुओंग मिन्ह ले और डाट क्वोक गुयेन द्वारा पोस्ट किया गया। @@ -247,24 +250,31 @@ conda install -c huggingface transformers 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (VinAI Research से) साथ में पेपर [BERTweet: अंग्रेजी ट्वीट्स के लिए एक पूर्व-प्रशिक्षित भाषा मॉडल] (https://aclanthology.org/2020.emnlp-demos.2/) डाट क्वोक गुयेन, थान वु और अन्ह तुआन गुयेन द्वारा प्रकाशित। 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (गूगल रिसर्च से) साथ वाला पेपर [बिग बर्ड: ट्रांसफॉर्मर्स फॉर लॉन्गर सीक्वेंस](https://arxiv .org/abs/2007.14062) मंज़िल ज़हीर, गुरु गुरुगणेश, अविनावा दुबे, जोशुआ आइंस्ली, क्रिस अल्बर्टी, सैंटियागो ओंटानोन, फिलिप फाम, अनिरुद्ध रावुला, किफ़ान वांग, ली यांग, अमर अहमद द्वारा। 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (गूगल रिसर्च से) साथ में पेपर [बिग बर्ड: ट्रांसफॉर्मर्स फॉर लॉन्गर सीक्वेंस](https://arxiv.org/abs/2007.14062) मंज़िल ज़हीर, गुरु गुरुगणेश, अविनावा दुबे, जोशुआ आइंस्ली, क्रिस अल्बर्टी, सैंटियागो ओंटानन, फिलिप फाम द्वारा , अनिरुद्ध रावुला, किफ़ान वांग, ली यांग, अमर अहमद द्वारा पोस्ट किया गया। -1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. +1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (फेसबुक से) साथ में कागज [एक ओपन-डोमेन चैटबॉट बनाने की विधि](https://arxiv.org /abs/2004.13637) स्टीफन रोलर, एमिली दीनन, नमन गोयल, दा जू, मैरी विलियमसन, यिनहान लियू, जिंग जू, मायल ओट, कर्ट शस्टर, एरिक एम। स्मिथ, वाई-लैन बॉरो, जेसन वेस्टन द्वारा। 1. 
**[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (फेसबुक से) साथ में पेपर [एक ओपन-डोमेन चैटबॉट बनाने की रेसिपी](https://arxiv .org/abs/2004.13637) स्टीफन रोलर, एमिली दीनन, नमन गोयल, दा जू, मैरी विलियमसन, यिनहान लियू, जिंग जू, मायल ओट, कर्ट शस्टर, एरिक एम स्मिथ, वाई-लैन बॉरो, जेसन वेस्टन द्वारा। -1. **[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (Salesforce से) Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi. द्वाराअनुसंधान पत्र [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) के साथ जारी किया गया 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigSicence Workshop](https://bigscience.huggingface.co/). 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (एलेक्सा से) कागज के साथ [बीईआरटी के लिए ऑप्टिमल सबआर्किटेक्चर एक्सट्रैक्शन](https://arxiv.org/abs/ 2010.10499) एड्रियन डी विंटर और डैनियल जे पेरी द्वारा। +1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (हरबिन इंस्टिट्यूट ऑफ़ टेक्नोलॉजी/माइक्रोसॉफ्ट रिसर्च एशिया/इंटेल लैब्स से) कागज के साथ [ब्रिजटॉवर: विजन-लैंग्वेज रिप्रेजेंटेशन लर्निंग में एनकोडर्स के बीच ब्रिज बनाना]() by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. +1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (NAVER CLOVA से) Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park. द्वाराअनुसंधान पत्र [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) के साथ जारी किया गया 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (Google अनुसंधान से) साथ में कागज [ByT5: पूर्व-प्रशिक्षित बाइट-टू-बाइट मॉडल के साथ एक टोकन-मुक्त भविष्य की ओर] (https://arxiv.org/abs/2105.13626) Linting Xue, Aditya Barua, Noah Constant, रामी अल-रफू, शरण नारंग, मिहिर काले, एडम रॉबर्ट्स, कॉलिन रैफेल द्वारा पोस्ट किया गया। 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (इनरिया/फेसबुक/सोरबोन से) साथ में कागज [CamemBERT: एक टेस्टी फ्रेंच लैंग्वेज मॉडल](https:// arxiv.org/abs/1911.03894) लुई मार्टिन*, बेंजामिन मुलर*, पेड्रो जेवियर ऑर्टिज़ सुआरेज़*, योआन ड्यूपॉन्ट, लॉरेंट रोमरी, एरिक विलेमोन्टे डे ला क्लर्जरी, जैमे सेडाह और बेनोइट सगोट द्वारा। 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google रिसर्च से) साथ में दिया गया पेपर [कैनाइन: प्री-ट्रेनिंग ए एफिशिएंट टोकनाइजेशन-फ्री एनकोडर फॉर लैंग्वेज रिप्रेजेंटेशन]( https://arxiv.org/abs/2103.06874) जोनाथन एच क्लार्क, डैन गैरेट, यूलिया टर्क, जॉन विएटिंग द्वारा। 1. 
**[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. +1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (LAION-AI से) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. द्वाराअनुसंधान पत्र [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) के साथ जारी किया गया 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI से) साथ वाला पेपर [लर्निंग ट्रांसफरेबल विजुअल मॉडल फ्रॉम नेचुरल लैंग्वेज सुपरविजन](https://arxiv.org /abs/2103.00020) एलेक रैडफोर्ड, जोंग वूक किम, क्रिस हैलासी, आदित्य रमेश, गेब्रियल गोह, संध्या अग्रवाल, गिरीश शास्त्री, अमांडा एस्केल, पामेला मिश्किन, जैक क्लार्क, ग्रेचेन क्रुएगर, इल्या सुत्स्केवर द्वारा। 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (सेल्सफोर्स से) साथ में पेपर [प्रोग्राम सिंथेसिस के लिए एक संवादात्मक प्रतिमान](https://arxiv.org/abs/2203.13474) एरिक निजकैंप, बो पैंग, हिरोआकी हयाशी, लिफू तू, हुआन वांग, यिंगबो झोउ, सिल्वियो सावरेस, कैमिंग जिओंग रिलीज। +1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (MetaAI से) Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. द्वाराअनुसंधान पत्र [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) के साथ जारी किया गया 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (माइक्रोसॉफ्ट रिसर्च एशिया से) कागज के साथ [फास्ट ट्रेनिंग कन्वर्जेंस के लिए सशर्त डीईटीआर](https://arxiv. org/abs/2108.06152) डेपू मेंग, ज़ियाओकांग चेन, ज़ेजिया फैन, गैंग ज़ेंग, होउकियांग ली, युहुई युआन, लेई सन, जिंगडोंग वांग द्वारा। 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (YituTech से) साथ में कागज [ConvBERT: स्पैन-आधारित डायनेमिक कनवल्शन के साथ BERT में सुधार](https://arxiv .org/abs/2008.02496) जिहांग जियांग, वीहाओ यू, डाकान झोउ, युनपेंग चेन, जियाशी फेंग, शुइचेंग यान द्वारा। 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (Facebook AI से) साथ वाला पेपर [A ConvNet for the 2020s](https://arxiv.org/abs /2201.03545) ज़ुआंग लियू, हेंज़ी माओ, चाओ-युआन वू, क्रिस्टोफ़ फीचटेनहोफ़र, ट्रेवर डेरेल, सैनिंग ज़ी द्वारा। +1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. 1. 
**[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (सिंघुआ यूनिवर्सिटी से) साथ में पेपर [सीपीएम: ए लार्ज-स्केल जेनेरेटिव चाइनीज प्री-ट्रेंड लैंग्वेज मॉडल](https : //arxiv.org/abs/2012.00413) झेंग्यान झांग, जू हान, हाओ झोउ, पेई के, युक्सियन गु, डेमिंग ये, युजिया किन, युशेंग सु, हाओझे जी, जियान गुआन, फैंचाओ क्यूई, ज़ियाओझी वांग, यानान झेंग द्वारा , गुओयांग ज़ेंग, हुआनकी काओ, शेंगकी चेन, डाइक्सुआन ली, ज़ेनबो सन, ज़ियुआन लियू, मिनली हुआंग, वेंटाओ हान, जी तांग, जुआनज़ी ली, ज़ियाओयान झू, माओसोंग सन। +1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/). 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (सेल्सफोर्स से) साथ में पेपर [CTRL: ए कंडिशनल ट्रांसफॉर्मर लैंग्वेज मॉडल फॉर कंट्रोलेबल जेनरेशन](https://arxiv.org/abs/1909.05858) नीतीश शिरीष केसकर*, ब्रायन मैककैन*, लव आर. वार्ष्णेय, कैमिंग जिओंग और रिचर्ड द्वारा सोचर द्वारा जारी किया गया। 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft से) साथ में दिया गया पेपर [CvT: इंट्रोड्यूसिंग कनवॉल्यूशन टू विजन ट्रांसफॉर्मर्स](https://arxiv.org/ एब्स/2103.15808) हैपिंग वू, बिन जिओ, नोएल कोडेला, मेंगचेन लियू, जियांग दाई, लू युआन, लेई झांग द्वारा। 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (फेसबुक से) साथ में कागज [Data2Vec: भाषण, दृष्टि और भाषा में स्व-पर्यवेक्षित सीखने के लिए एक सामान्य ढांचा] (https://arxiv.org/abs/2202.03555) एलेक्सी बाएव्स्की, वेई-निंग सू, कियानटोंग जू, अरुण बाबू, जियाताओ गु, माइकल औली द्वारा पोस्ट किया गया। @@ -273,24 +283,34 @@ conda install -c huggingface transformers 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (बर्कले/फेसबुक/गूगल से) पेपर के साथ [डिसीजन ट्रांसफॉर्मर: रीनफोर्समेंट लर्निंग वाया सीक्वेंस मॉडलिंग](https : //arxiv.org/abs/2106.01345) लिली चेन, केविन लू, अरविंद राजेश्वरन, किमिन ली, आदित्य ग्रोवर, माइकल लास्किन, पीटर एबील, अरविंद श्रीनिवास, इगोर मोर्डच द्वारा पोस्ट किया गया। 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (सेंसटाइम रिसर्च से) साथ में पेपर [डिफॉर्मेबल डीईटीआर: डिफॉर्मेबल ट्रांसफॉर्मर्स फॉर एंड-टू-एंड ऑब्जेक्ट डिटेक्शन] (https://arxiv.org/abs/2010.04159) Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, जिफेंग दाई द्वारा पोस्ट किया गया। 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (फेसबुक से) साथ में पेपर [ट्रेनिंग डेटा-एफिशिएंट इमेज ट्रांसफॉर्मर और डिस्टिलेशन थ्रू अटेंशन](https://arxiv .org/abs/2012.12877) ह्यूगो टौव्रोन, मैथ्यू कॉर्ड, मैथिज्स डूज़, फ़्रांसिस्को मस्सा, एलेक्ज़ेंडर सबलेरोल्स, हर्वे जेगौ द्वारा। +1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (Google AI से) Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun. द्वाराअनुसंधान पत्र [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) के साथ जारी किया गया +1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl. 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (फेसबुक से) साथ में कागज [ट्रांसफॉर्मर्स के साथ एंड-टू-एंड ऑब्जेक्ट डिटेक्शन](https://arxiv. 
org/abs/2005.12872) निकोलस कैरियन, फ़्रांसिस्को मस्सा, गेब्रियल सिनेव, निकोलस उसुनियर, अलेक्जेंडर किरिलोव, सर्गेई ज़ागोरुयको द्वारा। 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (माइक्रोसॉफ्ट रिसर्च से) कागज के साथ [DialoGPT: बड़े पैमाने पर जनरेटिव प्री-ट्रेनिंग फॉर कन्वर्सेशनल रिस्पांस जेनरेशन](https ://arxiv.org/abs/1911.00536) यिज़े झांग, सिकी सन, मिशेल गैली, येन-चुन चेन, क्रिस ब्रोकेट, जियांग गाओ, जियानफेंग गाओ, जिंगजिंग लियू, बिल डोलन द्वारा। 1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi. +1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (Meta AI से) Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski. द्वाराअनुसंधान पत्र [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) के साथ जारी किया गया 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (हगिंगफेस से), साथ में कागज [डिस्टिलबर्ट, बीईआरटी का डिस्टिल्ड वर्जन: छोटा, तेज, सस्ता और हल्का] (https://arxiv.org/abs/1910.01108) विक्टर सनह, लिसांड्रे डेब्यू और थॉमस वुल्फ द्वारा पोस्ट किया गया। यही तरीका GPT-2 को [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERta से [DistilRoBERta](https://github.com) पर कंप्रेस करने के लिए भी लागू किया जाता है। / हगिंगफेस/ट्रांसफॉर्मर्स/ट्री/मेन/उदाहरण/डिस्टिलेशन), बहुभाषी BERT से [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) और डिस्टिलबर्ट का जर्मन संस्करण। 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (माइक्रोसॉफ्ट रिसर्च से) साथ में पेपर [DiT: सेल्फ सुपरवाइज्ड प्री-ट्रेनिंग फॉर डॉक्यूमेंट इमेज ट्रांसफॉर्मर](https://arxiv.org/abs/2203.02378) जुनलॉन्ग ली, यिहेंग जू, टेंगचाओ लव, लेई कुई, चा झांग द्वारा फुरु वेई द्वारा पोस्ट किया गया। 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (NAVER से) साथ में कागज [OCR-मुक्त डॉक्यूमेंट अंडरस्टैंडिंग ट्रांसफॉर्मर](https://arxiv.org/abs /2111.15664) गीवूक किम, टीकग्यू होंग, मूनबिन यिम, जियोंग्योन नाम, जिनयॉन्ग पार्क, जिनयॉन्ग यिम, वोनसेओक ह्वांग, सांगडू यूं, डोंगयून हान, सेउंग्युन पार्क द्वारा। 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (फेसबुक से) साथ में पेपर [ओपन-डोमेन क्वेश्चन आंसरिंग के लिए डेंस पैसेज रिट्रीवल](https://arxiv. org/abs/2004.04906) व्लादिमीर करपुखिन, बरलास ओज़ुज़, सेवन मिन, पैट्रिक लुईस, लेडेल वू, सर्गेई एडुनोव, डैनकी चेन, और वेन-ताऊ यिह द्वारा। 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (इंटेल लैब्स से) साथ में कागज [विज़न ट्रांसफॉर्मर्स फॉर डेंस प्रेडिक्शन](https://arxiv.org /abs/2103.13413) रेने रैनफ्टल, एलेक्सी बोचकोवस्की, व्लादलेन कोल्टन द्वारा। +1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. +1. 
**[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (Google रिसर्च/स्टैनफोर्ड यूनिवर्सिटी से) साथ में दिया गया पेपर [इलेक्ट्रा: जेनरेटर के बजाय भेदभाव करने वाले के रूप में टेक्स्ट एन्कोडर्स का पूर्व-प्रशिक्षण] (https://arxiv.org/abs/2003.10555) केविन क्लार्क, मिन्ह-थांग लुओंग, क्वोक वी. ले, क्रिस्टोफर डी. मैनिंग द्वारा पोस्ट किया गया। +1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (Meta AI से) Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi. द्वाराअनुसंधान पत्र [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) के साथ जारी किया गया 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (Google रिसर्च से) साथ में दिया गया पेपर [सीक्वेंस जेनरेशन टास्क के लिए प्री-ट्रेंड चेकपॉइंट का इस्तेमाल करना](https:/ /arxiv.org/abs/1907.12461) साशा रोठे, शशि नारायण, अलियाक्सि सेवेरिन द्वारा। 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)**(Baidu से) साथ देने वाला पेपर [ERNIE: एन्हांस्ड रिप्रेजेंटेशन थ्रू नॉलेज इंटीग्रेशन](https://arxiv.org/abs/1904.09223) यू सन, शुओहुआन वांग, युकुन ली, शिकुन फेंग, ज़ुई चेन, हान झांग, शिन तियान, डैनक्सियांग झू, हाओ तियान, हुआ वू द्वारा पोस्ट किया गया। +1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (Baidu से) Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang. द्वाराअनुसंधान पत्र [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) के साथ जारी किया गया 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (मेटा AI से) ट्रांसफॉर्मर प्रोटीन भाषा मॉडल हैं। **ESM-1b** पेपर के साथ जारी किया गया था [ अलेक्जेंडर राइव्स, जोशुआ मेयर, टॉम सर्कु, सिद्धार्थ गोयल, ज़ेमिंग लिन द्वारा जैविक संरचना और कार्य असुरक्षित सीखने को 250 मिलियन प्रोटीन अनुक्रमों तक स्केल करने से उभरता है] (https://www.pnas.org/content/118/15/e2016239118) जेसन लियू, डेमी गुओ, मायल ओट, सी. लॉरेंस ज़िटनिक, जेरी मा और रॉब फर्गस। **ESM-1v** को पेपर के साथ जारी किया गया था [भाषा मॉडल प्रोटीन फ़ंक्शन पर उत्परिवर्तन के प्रभावों की शून्य-शॉट भविष्यवाणी को सक्षम करते हैं] (https://doi.org/10.1101/2021.07.09.450648) जोशुआ मेयर, रोशन राव, रॉबर्ट वेरकुइल, जेसन लियू, टॉम सर्कु और अलेक्जेंडर राइव्स द्वारा। **ESM-2** को पेपर के साथ जारी किया गया था [भाषा मॉडल विकास के पैमाने पर प्रोटीन अनुक्रम सटीक संरचना भविष्यवाणी को सक्षम करते हैं](https://doi.org/10.1101/2022.07.20.500902) ज़ेमिंग लिन, हलील अकिन, रोशन राव, ब्रायन ही, झोंगकाई झू, वेंटिंग लू, ए द्वारा लान डॉस सैंटोस कोस्टा, मरियम फ़ज़ल-ज़रंडी, टॉम सर्कू, साल कैंडिडो, अलेक्जेंडर राइव्स। -1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. 
Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. +1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (CNRS से) साथ वाला पेपर [FlauBERT: Unsupervised Language Model Pre-training for फ़्रेंच](https://arxiv .org/abs/1912.05372) Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, बेंजामिन लेकोउटेक्स, अलेक्जेंड्रे अल्लाउज़ेन, बेनोइट क्रैबे, लॉरेंट बेसेसियर, डिडिएर श्वाब द्वारा। 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (FLAVA: A फाउंडेशनल लैंग्वेज एंड विजन अलाइनमेंट मॉडल) (https://arxiv) साथ वाला पेपर .org/abs/2112.04482) अमनप्रीत सिंह, रोंगहांग हू, वेदानुज गोस्वामी, गुइल्यूम कुएरॉन, वोज्शिएक गालुबा, मार्कस रोहरबैक, और डौवे कीला द्वारा। 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (गूगल रिसर्च से) साथ वाला पेपर [FNet: मिक्सिंग टोकन विद फूरियर ट्रांसफॉर्म्स](https://arxiv.org /abs/2105.03824) जेम्स ली-थॉर्प, जोशुआ आइंस्ली, इल्या एकस्टीन, सैंटियागो ओंटानन द्वारा। +1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (Microsoft Research से) Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. द्वाराअनुसंधान पत्र [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) के साथ जारी किया गया 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (सीएमयू/गूगल ब्रेन से) साथ में कागज [फ़नल-ट्रांसफॉर्मर: कुशल भाषा प्रसंस्करण के लिए अनुक्रमिक अतिरेक को छानना](https://arxiv.org/abs/2006.03236) जिहांग दाई, गुओकुन लाई, यिमिंग यांग, क्वोक वी. ले ​​द्वारा रिहाई। -1. 
**[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. +1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (KAIST से) साथ वाला पेपर [वर्टिकल कटडेप्थ के साथ मोनोकुलर डेप्थ एस्टीमेशन के लिए ग्लोबल-लोकल पाथ नेटवर्क्स](https:/ /arxiv.org/abs/2201.07436) डोयोन किम, वूंगह्युन गा, प्युंगवान आह, डोंगग्यू जू, सेहवान चुन, जुनमो किम द्वारा। 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (OpenAI से) साथ में दिया गया पेपर [जेनरेटिव प्री-ट्रेनिंग द्वारा भाषा की समझ में सुधार](https://blog .openai.com/language-unsupervised/) एलेक रैडफोर्ड, कार्तिक नरसिम्हन, टिम सालिमन्स और इल्या सुत्स्केवर द्वारा। 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (EleutherAI से) रिपॉजिटरी के साथ [EleutherAI/gpt-neo](https://github.com/ EleutherAI /gpt-neo) रिलीज। सिड ब्लैक, स्टेला बिडरमैन, लियो गाओ, फिल वांग और कॉनर लेही द्वारा पोस्ट किया गया। @@ -298,11 +318,18 @@ conda install -c huggingface transformers 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (अबेजा के जरिए) शिन्या ओटानी, ताकायोशी मकाबे, अनुज अरोड़ा, क्यो हटोरी द्वारा। 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (ओपनएआई से) साथ में पेपर [लैंग्वेज मॉडल्स अनसुपरवाइज्ड मल्टीटास्क लर्नर्स हैं](https://blog.openai.com/better-language-models/) एलेक रैडफोर्ड*, जेफरी वू*, रेवन चाइल्ड, डेविड लुआन, डारियो एमोडी* द्वारा * और इल्या सुत्सकेवर** ने पोस्ट किया। 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (EleutherAI से) साथ वाला पेपर [kingoflolz/mesh-transformer-jax](https://github. com/kingoflolz/mesh-transformer-jax/) बेन वांग और अरन कोमात्सुजाकी द्वारा। -1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode से) Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. द्वाराअनुसंधान पत्र [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) के साथ जारी किया गया +1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). +1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA से) साथ में कागज [GroupViT: टेक्स्ट सुपरविजन से सिमेंटिक सेगमेंटेशन इमर्जेस](https://arxiv .org/abs/2202.11094) जियारुई जू, शालिनी डी मेलो, सिफ़ी लियू, वोनमिन बायन, थॉमस ब्रेउएल, जान कौट्ज़, ज़ियाओलोंग वांग द्वारा। +1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology से) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. द्वाराअनुसंधान पत्र [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) के साथ जारी किया गया 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (फेसबुक से) साथ में पेपर [ह्यूबर्ट: सेल्फ सुपरवाइज्ड स्पीच रिप्रेजेंटेशन लर्निंग बाय मास्क्ड प्रेडिक्शन ऑफ हिडन यूनिट्स](https ://arxiv.org/abs/2106.07447) वेई-निंग सू, बेंजामिन बोल्टे, याओ-हंग ह्यूबर्ट त्साई, कुशाल लखोटिया, रुस्लान सालाखुतदीनोव, अब्देलरहमान मोहम्मद द्वारा। 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (बर्कले से) साथ में कागज [I-BERT: Integer-only BERT Quantization](https:// arxiv.org/abs/2101.01321) सेहून किम, अमीर घोलमी, ज़ेवेई याओ, माइकल डब्ल्यू महोनी, कर्ट केटज़र द्वारा। +1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. +1. 
**[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang. +1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (Salesforce से) Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. द्वाराअनुसंधान पत्र [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) के साथ जारी किया गया 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever. 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. @@ -311,6 +338,8 @@ conda install -c huggingface transformers 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (मेटा AI से) साथ वाला पेपर [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https:/ /arxiv.org/abs/2104.01136) बेन ग्राहम, अलाएल्डिन एल-नौबी, ह्यूगो टौवरन, पियरे स्टॉक, आर्मंड जौलिन, हर्वे जेगौ, मैथिज डूज़ द्वारा। 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (दक्षिण चीन प्रौद्योगिकी विश्वविद्यालय से) साथ में कागज [LiLT: एक सरल लेकिन प्रभावी भाषा-स्वतंत्र लेआउट ट्रांसफार्मर संरचित दस्तावेज़ समझ के लिए](https://arxiv.org/abs/2202.13669) जियापेंग वांग, लियानवेन जिन, काई डिंग द्वारा पोस्ट किया गया। +1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (The FAIR team of Meta AI से) Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. द्वाराअनुसंधान पत्र [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) के साथ जारी किया गया +1. 
**[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (The FAIR team of Meta AI से) Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom. द्वारा अनुसंधान पत्र [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) के साथ जारी किया गया
1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** मैंडी गुओ, जोशुआ आइंस्ली, डेविड यूथस, सैंटियागो ओंटानन, जियानमो नि, यूं-हुआन सुंग, यिनफेई यांग द्वारा पोस्ट किया गया।
1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (स्टूडियो औसिया से) साथ में पेपर [LUKE: डीप कॉन्टेक्स्टुअलाइज्ड एंटिटी रिप्रेजेंटेशन विद एंटिटी-अवेयर सेल्फ-अटेंशन](https://arxiv.org/abs/2010.01057) Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto द्वारा।
@@ -319,32 +348,50 @@ conda install -c huggingface transformers
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (फेसबुक से) साथ देने वाला पेपर [बियॉन्ड इंग्लिश-सेंट्रिक मल्टीलिंगुअल मशीन ट्रांसलेशन](https://arxiv.org/abs/2010.11125) एंजेला फैन, श्रुति भोसले, होल्गर श्वेन्क, झी मा, अहमद अल-किश्की, सिद्धार्थ गोयल, मनदीप बैनेस, ओनूर सेलेबी, गुइल्लाम वेन्जेक, विश्रव चौधरी, नमन गोयल, टॉम बर्च, विटाली लिपचिंस्की, सर्गेई एडुनोव, एडौर्ड ग्रेव, माइकल औली, आर्मंड जौलिन द्वारा पोस्ट किया गया।
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** [OPUS](http://opus.nlpl.eu/) डेटा से प्रशिक्षित मशीनी अनुवाद मॉडल, Jörg टाइडेमैन द्वारा पोस्ट किया गया। [मैरियन फ्रेमवर्क](https://marian-nmt.github.io/) माइक्रोसॉफ्ट ट्रांसलेटर टीम द्वारा विकसित।
1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (माइक्रोसॉफ्ट रिसर्च एशिया से) साथ में पेपर [मार्कअपएलएम: विजुअली-रिच डॉक्यूमेंट अंडरस्टैंडिंग के लिए टेक्स्ट और मार्कअप लैंग्वेज का प्री-ट्रेनिंग](https://arxiv.org/abs/2110.08518) जुनलॉन्ग ली, यिहेंग जू, लेई कुई, फुरु वेई द्वारा पोस्ट किया गया।
+1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (FAIR and UIUC से) Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. द्वारा अनुसंधान पत्र [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) के साथ जारी किया गया
1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (मेटा और UIUC से) पेपर के साथ जारी किया गया [प्रति-पिक्सेल वर्गीकरण वह सब नहीं है जिसकी आपको सिमेंटिक सेगमेंटेशन की आवश्यकता है](https://arxiv.org/abs/2107.06278) बोवेन चेंग, अलेक्जेंडर जी. श्विंग, अलेक्जेंडर किरिलोव द्वारा
+1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (Google AI से) Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos. द्वारा अनुसंधान पत्र [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) के साथ जारी किया गया
1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (फेसबुक से) साथ में पेपर [न्यूरल मशीन ट्रांसलेशन के लिए मल्टीलिंगुअल डीनोइजिंग प्री-ट्रेनिंग](https://arxiv.org/abs/2001.08210) यिनहान लियू, जियाताओ गु, नमन गोयल, जियान ली, सर्गेई एडुनोव, मार्जन ग़ज़विनिनेजाद, माइक लुईस, ल्यूक ज़ेटलमॉयर द्वारा।
1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (फेसबुक से) साथ में पेपर [एक्स्टेंसिबल बहुभाषी प्रीट्रेनिंग और फाइनट्यूनिंग के साथ बहुभाषी अनुवाद](https://arxiv.org/abs/2008.00401) युकिंग टैंग, चाउ ट्रान, जियान ली, पेंग-जेन चेन, नमन गोयल, विश्रव चौधरी, जियाताओ गु, एंजेला फैन द्वारा।
+1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (Facebook से) Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. द्वारा अनुसंधान पत्र [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) के साथ जारी किया गया
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (NVIDIA से) कागज के साथ [Megatron-LM: मॉडल Parallelism का उपयोग करके बहु-अरब पैरामीटर भाषा मॉडल का प्रशिक्षण](https://arxiv.org/abs/1909.08053) मोहम्मद शोएबी, मोस्टोफा पटवारी, राउल पुरी, पैट्रिक लेग्रेस्ले, जेरेड कैस्पर और ब्रायन कैटानज़ारो द्वारा।
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (NVIDIA से) साथ वाला पेपर [Megatron-LM: ट्रेनिंग मल्टी-बिलियन पैरामीटर लैंग्वेज मॉडल्स यूजिंग मॉडल पैरेललिज़्म](https://arxiv.org/abs/1909.08053) मोहम्मद शोएबी, मोस्टोफा पटवारी, राउल पुरी, पैट्रिक लेग्रेस्ले, जेरेड कैस्पर और ब्रायन कैटानज़ारो द्वारा पोस्ट किया गया।
+1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (Alibaba Research से) Peng Wang, Cheng Da, and Cong Yao. द्वारा अनुसंधान पत्र [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) के साथ जारी किया गया
+1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (फ्रॉम Studio Ousia) साथ में पेपर [mLUKE: द पावर ऑफ एंटिटी रिप्रेजेंटेशन इन मल्टीलिंगुअल प्रीट्रेन्ड लैंग्वेज मॉडल्स](https://arxiv.org/abs/2110.08151) रयोकन री, इकुया यामाडा, और योशिमासा त्सुरोका द्वारा।
+1.
**[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (Facebook से) Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli. द्वाराअनुसंधान पत्र [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) के साथ जारी किया गया 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (सीएमयू/गूगल ब्रेन से) साथ में कागज [मोबाइलबर्ट: संसाधन-सीमित उपकरणों के लिए एक कॉम्पैक्ट टास्क-अज्ञेय बीईआरटी] (https://arxiv.org/abs/2004.02984) Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, और Denny Zhou द्वारा पोस्ट किया गया। 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam. 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen. 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (Apple से) साथ में कागज [MobileViT: लाइट-वेट, जनरल-पर्पस, और मोबाइल-फ्रेंडली विजन ट्रांसफॉर्मर] (https://arxiv.org/abs/2110.02178) सचिन मेहता और मोहम्मद रस्तगरी द्वारा पोस्ट किया गया। +1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (Apple से) Sachin Mehta and Mohammad Rastegari. द्वाराअनुसंधान पत्र [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) के साथ जारी किया गया 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. +1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (MosaiML से) the MosaicML NLP Team. द्वाराअनुसंधान पत्र [llm-foundry](https://github.com/mosaicml/llm-foundry/) के साथ जारी किया गया +1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (the University of Wisconsin - Madison से) Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh. द्वाराअनुसंधान पत्र [Multi Resolution Analysis (MRA)](https://arxiv.org/abs/2207.10284) के साथ जारी किया गया 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (Google AI से) साथ वाला पेपर [mT5: एक व्यापक बहुभाषी पूर्व-प्रशिक्षित टेक्स्ट-टू-टेक्स्ट ट्रांसफॉर्मर]( https://arxiv.org/abs/2010.11934) लिंटिंग ज़ू, नोआ कॉन्सटेंट, एडम रॉबर्ट्स, मिहिर काले, रामी अल-रफू, आदित्य सिद्धांत, आदित्य बरुआ, कॉलिन रैफेल द्वारा पोस्ट किया गया। +1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez. 1. 
**[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. 1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi. 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (हुआवेई नूह के आर्क लैब से) साथ में कागज़ [NEZHA: चीनी भाषा समझ के लिए तंत्रिका प्रासंगिक प्रतिनिधित्व](https :/ /arxiv.org/abs/1909.00204) जुन्किउ वेई, ज़ियाओज़े रेन, ज़िआओगुआंग ली, वेनयोंग हुआंग, यी लियाओ, याशेंग वांग, जियाशू लिन, शिन जियांग, जिओ चेन और कुन लियू द्वारा। 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (फ्रॉम मेटा) साथ में पेपर [नो लैंग्वेज लेफ्ट बिहाइंड: स्केलिंग ह्यूमन-सेंटेड मशीन ट्रांसलेशन] (https://arxiv.org/abs/2207.04672) एनएलएलबी टीम द्वारा प्रकाशित। +1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (Meta से) the NLLB team. द्वाराअनुसंधान पत्र [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) के साथ जारी किया गया +1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (Meta AI से) Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic. द्वाराअनुसंधान पत्र [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) के साथ जारी किया गया 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (विस्कॉन्सिन विश्वविद्यालय - मैडिसन से) साथ में कागज [Nyströmformer: A Nyström- आधारित एल्गोरिथम आत्म-ध्यान का अनुमान लगाने के लिए ](https://arxiv.org/abs/2102.03902) युनयांग ज़िओंग, झानपेंग ज़ेंग, रुद्रसिस चक्रवर्ती, मिंगक्सिंग टैन, ग्लेन फंग, यिन ली, विकास सिंह द्वारा पोस्ट किया गया। +1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (SHI Labs से) पेपर [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) जितेश जैन, जिआचेन ली, मांगटिक चिउ, अली हसनी, निकिता ओरलोव, हम्फ्री शि के द्वारा जारी किया गया है। +1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI से) साथ में कागज [विज़न ट्रांसफॉर्मर्स के साथ सिंपल ओपन-वोकैबुलरी ऑब्जेक्ट डिटेक्शन](https:/ /arxiv.org/abs/2205.06230) मैथियास मिंडरर, एलेक्सी ग्रिट्सेंको, ऑस्टिन स्टोन, मैक्सिम न्यूमैन, डिर्क वीसेनबोर्न, एलेक्सी डोसोवित्स्की, अरविंद महेंद्रन, अनुराग अर्नब, मुस्तफा देहघानी, ज़ुओरन शेन, जिओ वांग, ज़ियाओहुआ झाई, थॉमस किफ़, और नील हॉल्सबी द्वारा पोस्ट किया गया। 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. 
**[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google की ओर से) साथ में दिया गया पेपर [लंबे इनपुट सारांश के लिए ट्रांसफ़ॉर्मरों को बेहतर तरीके से एक्सटेंड करना](https://arxiv .org/abs/2208.04347) जेसन फांग, याओ झाओ, पीटर जे लियू द्वारा। 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (दीपमाइंड से) साथ में पेपर [पर्सीवर आईओ: संरचित इनपुट और आउटपुट के लिए एक सामान्य वास्तुकला] (https://arxiv.org/abs/2107.14795) एंड्रयू जेगल, सेबेस्टियन बोरग्यूड, जीन-बैप्टिस्ट अलायराक, कार्ल डोर्श, कैटलिन इओनेस्कु, डेविड द्वारा डिंग, स्कंद कोप्पुला, डैनियल ज़ोरान, एंड्रयू ब्रॉक, इवान शेलहैमर, ओलिवियर हेनाफ, मैथ्यू एम। बोट्विनिक, एंड्रयू ज़िसरमैन, ओरिओल विनियल्स, जोआओ कैरेरा द्वारा पोस्ट किया गया। +1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (ADEPT से) Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani. द्वाराअनुसंधान पत्र [blog post](https://www.adept.ai/blog/persimmon-8b) के साथ जारी किया गया 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (VinAI Research से) कागज के साथ [PhoBERT: वियतनामी के लिए पूर्व-प्रशिक्षित भाषा मॉडल](https://www .aclweb.org/anthology/2020.findings-emnlp.92/) डैट क्वोक गुयेन और अन्ह तुआन गुयेन द्वारा पोस्ट किया गया। +1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (Google से) Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. द्वाराअनुसंधान पत्र [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) के साथ जारी किया गया 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (UCLA NLP से) साथ वाला पेपर [प्रोग्राम अंडरस्टैंडिंग एंड जेनरेशन के लिए यूनिफाइड प्री-ट्रेनिंग](https://arxiv .org/abs/2103.06333) वसी उद्दीन अहमद, सैकत चक्रवर्ती, बैशाखी रे, काई-वेई चांग द्वारा। 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. +1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (माइक्रोसॉफ्ट रिसर्च से) साथ में पेपर [ProphetNet: प्रेडिक्टिंग फ्यूचर एन-ग्राम फॉर सीक्वेंस-टू-सीक्वेंस प्री-ट्रेनिंग ](https://arxiv.org/abs/2001.04063) यू यान, वीज़ेन क्यूई, येयुन गोंग, दयाहेंग लियू, नान डुआन, जिउशेंग चेन, रुओफ़ेई झांग और मिंग झोउ द्वारा पोस्ट किया गया। +1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (Nanjing University, The University of Hong Kong etc. से) Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. द्वाराअनुसंधान पत्र [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) के साथ जारी किया गया 1. 
**[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (NVIDIA से) साथ वाला पेपर [डीप लर्निंग इंफ़ेक्शन के लिए इंटीजर क्वांटिज़ेशन: प्रिंसिपल्स एंड एम्पिरिकल इवैल्यूएशन](https:// arxiv.org/abs/2004.09602) हाओ वू, पैट्रिक जुड, जिआओजी झांग, मिखाइल इसेव और पॉलियस माइकेविसियस द्वारा। 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (फेसबुक से) साथ में कागज [रिट्रीवल-ऑगमेंटेड जेनरेशन फॉर नॉलेज-इंटेंसिव एनएलपी टास्क](https://arxiv .org/abs/2005.11401) पैट्रिक लुईस, एथन पेरेज़, अलेक्जेंड्रा पिक्टस, फैबियो पेट्रोनी, व्लादिमीर कारपुखिन, नमन गोयल, हेनरिक कुटलर, माइक लुईस, वेन-ताउ यिह, टिम रॉकटाशेल, सेबस्टियन रिडेल, डौवे कीला द्वारा। 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (Google अनुसंधान से) केल्विन गु, केंटन ली, ज़ोरा तुंग, पानुपोंग पसुपत और मिंग-वेई चांग द्वारा साथ में दिया गया पेपर [REALM: रिट्रीवल-ऑगमेंटेड लैंग्वेज मॉडल प्री-ट्रेनिंग](https://arxiv.org/abs/2002.08909)। @@ -353,52 +400,65 @@ conda install -c huggingface transformers 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (गूगल रिसर्च से) साथ वाला पेपर [पूर्व-प्रशिक्षित भाषा मॉडल में एम्बेडिंग कपलिंग पर पुनर्विचार](https://arxiv .org/pdf/2010.12821.pdf) ह्युंग वोन चुंग, थिबॉल्ट फ़ेवरी, हेनरी त्साई, एम. जॉनसन, सेबेस्टियन रुडर द्वारा। 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (माइक्रोसॉफ्ट रिसर्च से) [डीप रेसिडुअल लर्निंग फॉर इमेज रिकग्निशन] (https://arxiv. org/abs/1512.03385) कैमिंग हे, जियांग्यु झांग, शाओकिंग रेन, जियान सन द्वारा। 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (फेसबुक से), साथ में कागज [मजबूत रूप से अनुकूलित BERT प्रीट्रेनिंग दृष्टिकोण](https://arxiv.org/abs /1907.11692) यिनहान लियू, मायल ओट, नमन गोयल, जिंगफेई डू, मंदार जोशी, डैनकी चेन, ओमर लेवी, माइक लुईस, ल्यूक ज़ेटलमॉयर, वेसेलिन स्टोयानोव द्वारा। -1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. -1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. +1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. +1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (झुईई टेक्नोलॉजी से), साथ में पेपर [रोफॉर्मर: रोटरी पोजिशन एंबेडिंग के साथ एन्हांस्ड ट्रांसफॉर्मर] (https://arxiv.org/pdf/2104.09864v1.pdf) जियानलिन सु और यू लू और शेंगफेंग पैन और बो वेन और युनफेंग लियू द्वारा प्रकाशित। +1. 
**[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (Bo Peng से) Bo Peng. द्वाराअनुसंधान पत्र [this repo](https://github.com/BlinkDL/RWKV-LM) के साथ जारी किया गया 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. +1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (Meta AI से) Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. द्वाराअनुसंधान पत्र [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) के साथ जारी किया गया 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP से) साथ देने वाला पेपर [भाषण पहचान के लिए अनसुपरवाइज्ड प्री-ट्रेनिंग में परफॉर्मेंस-एफिशिएंसी ट्रेड-ऑफ्स](https ://arxiv.org/abs/2109.06870) फेलिक्स वू, क्वांगयुन किम, जिंग पैन, क्यू हान, किलियन क्यू. वेनबर्गर, योव आर्टज़ी द्वारा। 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (ASAPP से) साथ में पेपर [भाषण पहचान के लिए अनसुपरवाइज्ड प्री-ट्रेनिंग में परफॉर्मेंस-एफिशिएंसी ट्रेड-ऑफ्स] (https://arxiv.org/abs/2109.06870) फेलिक्स वू, क्वांगयुन किम, जिंग पैन, क्यू हान, किलियन क्यू. वेनबर्गर, योआव आर्टज़ी द्वारा पोस्ट किया गया। +1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei. 1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (फेसबुक से), साथ में पेपर [फेयरसेक S2T: फास्ट स्पीच-टू-टेक्स्ट मॉडलिंग विद फेयरसेक](https: //arxiv.org/abs/2010.05171) चांगहान वांग, यूं तांग, जुताई मा, ऐनी वू, दिमित्रो ओखोनको, जुआन पिनो द्वारा पोस्ट किया गया。 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (फेसबुक से) साथ में पेपर [लार्ज-स्केल सेल्फ- एंड सेमी-सुपरवाइज्ड लर्निंग फॉर स्पीच ट्रांसलेशन](https://arxiv.org/abs/2104.06678) चांगहान वांग, ऐनी वू, जुआन पिनो, एलेक्सी बेवस्की, माइकल औली, एलेक्सिस द्वारा Conneau द्वारा पोस्ट किया गया। 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (तेल अवीव यूनिवर्सिटी से) साथ में पेपर [स्पैन सिलेक्शन को प्री-ट्रेनिंग करके कुछ-शॉट क्वेश्चन आंसरिंग](https:// arxiv.org/abs/2101.00438) ओरि राम, युवल कर्स्टन, जोनाथन बेरेंट, अमीर ग्लोबर्सन, ओमर लेवी द्वारा। 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (बर्कले से) कागज के साथ [SqueezeBERT: कुशल तंत्रिका नेटवर्क के बारे में NLP को कंप्यूटर विज़न क्या सिखा सकता है?](https: //arxiv.org/abs/2006.11316) फॉरेस्ट एन. इनडोला, अल्बर्ट ई. शॉ, रवि कृष्णा, और कर्ट डब्ल्यू. केटज़र द्वारा। +1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (MBZUAI से) Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. द्वाराअनुसंधान पत्र [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) के साथ जारी किया गया 1. 
**[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (माइक्रोसॉफ्ट से) साथ में कागज [स्वाइन ट्रांसफॉर्मर: शिफ्टेड विंडोज का उपयोग कर पदानुक्रमित विजन ट्रांसफॉर्मर](https://arxiv.org/abs/2103.14030) ज़ी लियू, युटोंग लिन, यू काओ, हान हू, यिक्सुआन वेई, झेंग झांग, स्टीफन लिन, बैनिंग गुओ द्वारा।
1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (Microsoft से) साथ वाला पेपर [Swin Transformer V2: स्केलिंग अप कैपेसिटी एंड रेजोल्यूशन](https://arxiv.org/abs/2111.09883) ज़ी लियू, हान हू, युटोंग लिन, ज़ुलिआंग याओ, ज़ेंडा ज़ी, यिक्सुआन वेई, जिया निंग, यू काओ, झेंग झांग, ली डोंग, फुरु वेई, बैनिंग गुओ द्वारा।
-1. **[Swin2SR](https://huggingface.co/docs/transformers/main/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
-1. **[SwitchTransformers](https://huggingface.co/docs/transformers/main/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
+1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
+1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (Google AI से) साथ में पेपर [एक एकीकृत टेक्स्ट-टू-टेक्स्ट ट्रांसफॉर्मर के साथ स्थानांतरण सीखने की सीमा की खोज](https://arxiv.org/abs/1910.10683) कॉलिन रैफेल और नोम शज़ीर और एडम रॉबर्ट्स और कैथरीन ली और शरण नारंग और माइकल मटेना और यांकी झोउ और वेई ली और पीटर जे लियू द्वारा।
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (Google AI से) साथ वाला पेपर [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) कॉलिन रैफेल और नोम शज़ीर और एडम रॉबर्ट्स और कैथरीन ली और शरण नारंग और माइकल मटेना और यांकी झोउ और वेई ली और पीटर जे लियू द्वारा।
1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (माइक्रोसॉफ्ट रिसर्च से) साथ में पेपर [पबटेबल्स-1एम: टूवर्ड्स कॉम्प्रिहेंसिव टेबल एक्सट्रैक्शन फ्रॉम अनस्ट्रक्चर्ड डॉक्यूमेंट्स](https://arxiv.org/abs/2110.00061) ब्रैंडन स्मॉक, रोहित पेसाला, रॉबिन अब्राहम द्वारा पोस्ट किया गया।
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (Google AI से) साथ में कागज [TAPAS: पूर्व-प्रशिक्षण के माध्यम से कमजोर पर्यवेक्षण तालिका पार्सिंग](https://arxiv.org/abs/2004.02349) जोनाथन हर्ज़िग, पावेल क्रिज़िस्तोफ़ नोवाक, थॉमस मुलर, फ्रांसेस्को पिकिन्नो और जूलियन मार्टिन ईसेन्च्लोस द्वारा।
1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (माइक्रोसॉफ्ट रिसर्च से) साथ में पेपर [TAPEX: टेबल प्री-ट्रेनिंग थ्रू लर्निंग अ न्यूरल SQL एक्ज़ीक्यूटर](https://arxiv.org/abs/2107.07653) कियान लियू, बेई चेन, जियाकी गुओ, मोर्टेज़ा ज़ियादी, ज़ेकी लिन, वीज़ू चेन, जियान-गुआंग लू द्वारा पोस्ट किया गया।
1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
-1. **[TimeSformer](https://huggingface.co/docs/transformers/main/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
+1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU की ओर से) साथ में कागज [ट्रांसफॉर्मर-एक्सएल: अटेंटिव लैंग्वेज मॉडल्स बियॉन्ड अ फिक्स्ड-लेंथ कॉन्टेक्स्ट](https://arxiv.org/abs/1901.02860) ज़िहांग दाई, ज़ीलिन यांग, यिमिंग यांग, जैम कार्बोनेल, क्वोक वी. ले, रुस्लान सलाखुतदीनोव द्वारा।
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
-1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
+1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
+1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
+1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (Google Research से) Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant. द्वारा अनुसंधान पत्र [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) के साथ जारी किया गया
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (माइक्रोसॉफ्ट रिसर्च से) साथ में दिया गया पेपर [UniSpeech: यूनिफाइड स्पीच रिप्रेजेंटेशन लर्निंग विद लेबलेड एंड अनलेबल्ड डेटा](https://arxiv.org/abs/2101.07597) चेंगई वांग, यू वू, याओ कियान, केनिची कुमातानी, शुजी लियू, फुरु वेई, माइकल ज़ेंग, ज़ुएदोंग हुआंग द्वारा।
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (माइक्रोसॉफ्ट रिसर्च से) कागज के साथ [UNISPEECH-SAT: यूनिवर्सल स्पीच रिप्रेजेंटेशन लर्निंग विद स्पीकर अवेयर प्री-ट्रेनिंग](https://arxiv.org/abs/2110.05752) सानयुआन चेन, यू वू, चेंग्यी वांग, झेंगयांग चेन, झूओ चेन, शुजी लियू, जियान वू, याओ कियान, फुरु वेई, जिन्यु ली, जियांगज़ान यू द्वारा पोस्ट किया गया।
+1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (सिंघुआ यूनिवर्सिटी और ननकाई यूनिवर्सिटी से) साथ में पेपर [विजुअल अटेंशन नेटवर्क](https://arxiv.org/pdf/2202.09741.pdf) मेंग-हाओ गुओ, चेंग-ज़े लू, झेंग-निंग लियू, मिंग-मिंग चेंग, शि-मिन हू द्वारा।
1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (मल्टीमीडिया कम्प्यूटिंग ग्रुप, नानजिंग यूनिवर्सिटी से) साथ में पेपर [वीडियोएमएई: मास्क्ड ऑटोएन्कोडर स्व-पर्यवेक्षित वीडियो प्री-ट्रेनिंग के लिए डेटा-कुशल सीखने वाले हैं](https://arxiv.org/abs/2203.12602) ज़ान टोंग, यिबिंग सॉन्ग, जुए वांग, लिमिन वांग द्वारा पोस्ट किया गया।
1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (NAVER AI Lab/Kakao Enterprise/Kakao Brain से) साथ में कागज [ViLT: Vision-and-Language Transformer बिना कनवल्शन या रीजन सुपरविजन](https://arxiv.org/abs/2102.03334) वोनजे किम, बोक्यूंग सोन, इल्डू किम द्वारा पोस्ट किया गया।
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (गूगल एआई से) कागज के साथ [एक इमेज इज़ वर्थ 16x16 वर्ड्स: ट्रांसफॉर्मर्स फॉर इमेज रिकॉग्निशन एट स्केल](https://arxiv.org/abs/2010.11929) एलेक्सी डोसोवित्स्की, लुकास बेयर, अलेक्जेंडर कोलेसनिकोव, डिर्क वीसेनबोर्न, शियाओहुआ झाई, थॉमस अनटरथिनर, मुस्तफा देहघानी, मैथियास मिंडरर, जॉर्ज हेगोल्ड, सिल्वेन गेली, जैकब उस्ज़कोरेइट, नील हॉल्सबी द्वारा पोस्ट किया गया।
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (UCLA NLP से) साथ वाला पेपर [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) लियुनियन हेरोल्ड ली, मार्क यात्स्कर, दा यिन, चो-जुई हसीह, काई-वेई चांग द्वारा।
-1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
+1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
+1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (Meta AI से) Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He. द्वारा अनुसंधान पत्र [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) के साथ जारी किया गया
1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (मेटा एआई से) साथ में कागज [मास्कड ऑटोएन्कोडर स्केलेबल विजन लर्नर्स हैं](https://arxiv.org/abs/2111.06377) कैमिंग हे, ज़िनेली चेन, सेनिंग ज़ी, यांगहो ली, पिओट्र डॉलर, रॉस गिर्शिक द्वारा।
+1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (HUST-VL से) Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. द्वारा अनुसंधान पत्र [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) के साथ जारी किया गया
1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (मेटा एआई से) साथ में कागज [लेबल-कुशल सीखने के लिए मास्क्ड सियामीज़ नेटवर्क](https://arxiv.org/abs/2204.07141) महमूद असरान, मथिल्डे कैरन, ईशान मिश्रा, पियोट्र बोजानोवस्की, फ्लोरियन बोर्डेस, पास्कल विंसेंट, आर्मंड जौलिन, माइकल रब्बत, निकोलस बल्लास द्वारा।
+1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (Kakao Enterprise से) Jaehyeon Kim, Jungil Kong, Juhee Son. द्वारा अनुसंधान पत्र [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) के साथ जारी किया गया
+1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid.
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (फेसबुक एआई से) साथ में पेपर [wav2vec 2.0: ए फ्रेमवर्क फॉर सेल्फ-सुपरवाइज्ड लर्निंग ऑफ स्पीच रिप्रेजेंटेशन](https://arxiv.org/abs/2006.11477) एलेक्सी बेवस्की, हेनरी झोउ, अब्देलरहमान मोहम्मद, माइकल औली द्वारा।
1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (Facebook AI से) साथ वाला पेपर [FAIRSEQ S2T: FAIRSEQ के साथ फास्ट स्पीच-टू-टेक्स्ट मॉडलिंग](https://arxiv.org/abs/2010.05171) चांगहान वांग, यूं तांग, जुताई मा, ऐनी वू, सरव्या पोपुरी, दिमित्रो ओखोनको, जुआन पिनो द्वारा पोस्ट किया गया।
1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (Facebook AI से) साथ वाला पेपर [सरल और प्रभावी जीरो-शॉट क्रॉस-लिंगुअल फोनेम रिकॉग्निशन](https://arxiv.org/abs/2109.11680) कियानटोंग जू, एलेक्सी बाएव्स्की, माइकल औली द्वारा।
1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (माइक्रोसॉफ्ट रिसर्च से) पेपर के साथ जारी किया गया [WavLM: फुल स्टैक स्पीच प्रोसेसिंग के लिए बड़े पैमाने पर स्व-पर्यवेक्षित पूर्व-प्रशिक्षण](https://arxiv.org/abs/2110.13900) सानयुआन चेन, चेंगयी वांग, झेंगयांग चेन, यू वू, शुजी लियू, ज़ुओ चेन, जिन्यु ली, नाओयुकी कांडा, ताकुया योशियोका, ज़िओंग जिओ, जियान वू, लॉन्ग झोउ, शुओ रेन, यानमिन कियान, याओ कियान, जियान वू, माइकल ज़ेंग, फुरु वेई।
1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (OpenAI से) साथ में कागज [बड़े पैमाने पर कमजोर पर्यवेक्षण के माध्यम से मजबूत भाषण पहचान](https://cdn.openai.com/papers/whisper.pdf) एलेक रैडफोर्ड, जोंग वूक किम, ताओ जू, ग्रेग ब्रॉकमैन, क्रिस्टीन मैकलीवे, इल्या सुत्स्केवर द्वारा।
1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (माइक्रोसॉफ्ट रिसर्च से) कागज के साथ [एक्सपैंडिंग लैंग्वेज-इमेज प्रीट्रेन्ड मॉडल फॉर जनरल वीडियो रिकग्निशन](https://arxiv.org/abs/2208.02816) बोलिन नी, होउवेन पेंग, मिंगाओ चेन, सोंगयांग झांग, गाओफेंग मेंग, जियानलोंग फू, शिमिंग जियांग, हैबिन लिंग द्वारा।
+1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (Meta AI से) Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe. द्वारा अनुसंधान पत्र [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) के साथ जारी किया गया
1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (फेसबुक से) साथ में पेपर [क्रॉस-लिंगुअल लैंग्वेज मॉडल प्रीट्रेनिंग](https://arxiv.org/abs/1901.07291) गिलाउम लैम्पल और एलेक्सिस कोनो द्वारा।
1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (माइक्रोसॉफ्ट रिसर्च से) साथ में कागज [ProphetNet: प्रेडिक्टिंग फ्यूचर एन-ग्राम फॉर सीक्वेंस-टू-सीक्वेंस प्री-ट्रेनिंग](https://arxiv.org/abs/2001.04063) यू यान, वीज़ेन क्यूई, येयुन गोंग, दयाहेंग लियू, नान डुआन, जिउशेंग चेन, रुओफ़ेई झांग और मिंग झोउ द्वारा।
1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (फेसबुक एआई से), साथ में पेपर [अनसुपरवाइज्ड क्रॉस-लिंगुअल रिप्रेजेंटेशन लर्निंग एट स्केल](https://arxiv.org/abs/1911.02116) एलेक्सिस कोन्यू*, कार्तिकेय खंडेलवाल*, नमन गोयल, विश्रव चौधरी, गिलाउम वेनज़ेक, फ्रांसिस्को गुज़मैन, एडौर्ड ग्रेव, मायल ओट, ल्यूक ज़ेटलमॉयर और वेसेलिन स्टोयानोव द्वारा।
1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (Facebook AI से) साथ में कागज [बहुभाषी नकाबपोश भाषा मॉडलिंग के लिए बड़े पैमाने पर ट्रांसफॉर्मर](https://arxiv.org/abs/2105.00572) नमन गोयल, जिंगफेई डू, मायल ओट, गिरि अनंतरामन, एलेक्सिस कोनो द्वारा पोस्ट किया गया।
+1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa.
1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (Google/CMU से) साथ वाला पेपर [XLNet: जनरलाइज्ड ऑटोरेग्रेसिव प्रीट्रेनिंग फॉर लैंग्वेज अंडरस्टैंडिंग](https://arxiv.org/abs/1906.08237) ज़ीलिन यांग*, ज़िहांग दाई*, यिमिंग यांग, जैम कार्बोनेल, रुस्लान सलाखुतदीनोव, क्वोक वी. ले द्वारा।
1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (Facebook AI से) साथ वाला पेपर [XLS-R: सेल्फ सुपरवाइज्ड क्रॉस-लिंगुअल स्पीच रिप्रेजेंटेशन लर्निंग एट स्केल](https://arxiv.org/abs/2111.09296) अरुण बाबू, चांगहान वांग, एंड्रोस तजंद्रा, कुशाल लखोटिया, कियानटोंग जू, नमन गोयल, कृतिका सिंह, पैट्रिक वॉन प्लैटन, याथार्थ सराफ, जुआन पिनो, एलेक्सी बेवस्की, एलेक्सिस कोन्यू, माइकल औली द्वारा पोस्ट किया गया।
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (फेसबुक एआई से) साथ में पेपर [अनसुपरवाइज्ड क्रॉस-लिंगुअल रिप्रेजेंटेशन लर्निंग फॉर स्पीच रिकग्निशन](https://arxiv.org/abs/2006.13979) एलेक्सिस कोन्यू, एलेक्सी बेवस्की, रोनन कोलोबर्ट, अब्देलरहमान मोहम्मद, माइकल औली द्वारा।
@@ -406,7 +466,7 @@ conda install -c huggingface transformers
1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (विस्कॉन्सिन विश्वविद्यालय - मैडिसन से) साथ में पेपर [यू ओनली सैंपल (लगभग)] ज़ानपेंग ज़ेंग, युनयांग ज़िओंग, सत्य एन. रवि, शैलेश आचार्य, ग्लेन फंग, विकास सिंह द्वारा पोस्ट किया गया।
1. एक नए मॉडल में योगदान देना चाहते हैं? नए मॉडल जोड़ने में आपका मार्गदर्शन करने के लिए हमारे पास एक **विस्तृत मार्गदर्शिका और टेम्प्लेट** है। आप उन्हें [`टेम्पलेट्स`](./templates) निर्देशिका में पा सकते हैं। पीआर शुरू करने से पहले [योगदान दिशानिर्देश](./CONTRIBUTING.md) देखना और अनुरक्षकों से संपर्क करना या प्रतिक्रिया प्राप्त करने के लिए एक नया मुद्दा खोलना याद रखें।
-यह जांचने के लिए कि क्या किसी मॉडल में पहले से ही Flax, PyTorch या TensorFlow का कार्यान्वयन है, या यदि उसके पास Tokenizers लाइब्रेरी में संबंधित टोकन है, तो [यह तालिका] (https://huggingface.co/ docs/transformers/index#supported) देखें। -फ्रेमवर्क)।
+यह जांचने के लिए कि क्या किसी मॉडल में पहले से ही Flax, PyTorch या TensorFlow का कार्यान्वयन है, या यदि उसके पास Tokenizers लाइब्रेरी में संबंधित टोकनाइज़र है, तो [यह तालिका](https://huggingface.co/docs/transformers/index#supported-frameworks) देखें।
इन कार्यान्वयनों का परीक्षण कई डेटासेट पर किया गया है (उदाहरण स्क्रिप्ट देखें) और इन्हें वैनिला कार्यान्वयन के तुलनीय प्रदर्शन करना चाहिए। आप उपयोग के मामलों का विवरण दस्तावेज़ के [इस अनुभाग](https://huggingface.co/docs/transformers/examples) में पढ़ सकते हैं।
diff --git a/README_ja.md b/README_ja.md
index 72f23dbeae3d..1ada3be1f4bc 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -53,7 +53,7 @@ user: ユーザ

@@ -82,7 +82,7 @@ user: ユーザ
@@ -258,7 +258,7 @@ And here is the equivalent code for TensorFlow: ### pipにて -このリポジトリは、Python 3.6+, Flax 0.3.2+, PyTorch 1.3.1+, TensorFlow 2.3+ でテストされています。 +このリポジトリは、Python 3.8+, Flax 0.4.1+, PyTorch 1.10+, TensorFlow 2.6+ でテストされています。 🤗Transformersは[仮想環境](https://docs.python.org/3/library/venv.html)にインストールする必要があります。Pythonの仮想環境に慣れていない場合は、[ユーザーガイド](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)を確認してください。 @@ -298,8 +298,11 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 🤗Transformersは現在、以下のアーキテクチャを提供しています(それぞれのハイレベルな要約は[こちら](https://huggingface.co/docs/transformers/model_summary)を参照してください): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (Google Research and the Toyota Technological Institute at Chicago から) Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut から公開された研究論文: [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942) -1. **[AltCLIP](https://huggingface.co/docs/transformers/main/model_doc/altclip)** (BAAI から) Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell から公開された研究論文: [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) +1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (Google Research から) Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig. から公開された研究論文 [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) +1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (BAAI から) Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell から公開された研究論文: [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (MIT から) Yuan Gong, Yu-An Chung, James Glass から公開された研究論文: [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) +1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long. +1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team. 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (Facebook から) Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer から公開された研究論文: [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (École polytechnique から) Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis から公開された研究論文: [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) 1. 
**[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (VinAI Research から) Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen から公開された研究論文: [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) @@ -309,24 +312,31 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (VinAI Research から) Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen から公開された研究論文: [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (Google Research から) Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed から公開された研究論文: [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (Google Research から) Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed から公開された研究論文: [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) -1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (Microsoft Research AI4Science から) Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu から公開された研究論文: [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (Google AI から) Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil から公開された研究論文: [Big Transfer (BiT)](https://arxiv.org/abs/1912.11370)Houlsby. +1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (Microsoft Research AI4Science から) Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu から公開された研究論文: [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) +1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (Google AI から) Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil から公開された研究論文: [Big Transfer (BiT)](https://arxiv.org/abs/1912.11370)Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (Facebook から) Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston から公開された研究論文: [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (Facebook から) Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston から公開された研究論文: [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) -1. 
**[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (Salesforce から) Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi から公開された研究論文: [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) +1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (Salesforce から) Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi から公開された研究論文: [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) +1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (Salesforce から) Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi. から公開された研究論文 [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (BigScience workshop から) [BigScience Workshop](https://bigscience.huggingface.co/) から公開されました. 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (Alexa から) Adrian de Wynter and Daniel J. Perry から公開された研究論文: [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) +1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (Harbin Institute of Technology/Microsoft Research Asia/Intel Labs から) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. +1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (NAVER CLOVA から) Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park. から公開された研究論文 [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (Google Research から) Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel から公開された研究論文: [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne から) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot から公開された研究論文: [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research から) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting から公開された研究論文: [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys から) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou から公開された研究論文: [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) +1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (LAION-AI から) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 
から公開された研究論文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI から) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever から公開された研究論文: [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen から) Timo Lüddecke and Alexander Ecker から公開された研究論文: [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce から) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong から公開された研究論文: [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) +1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (MetaAI から) Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. から公開された研究論文 [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (Microsoft Research Asia から) Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang から公開された研究論文: [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (YituTech から) Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan から公開された研究論文: [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (Facebook AI から) Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie から公開された研究論文: [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) +1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (Tsinghua University から) Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun から公開された研究論文: [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) +1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (OpenBMB から) [OpenBMB](https://www.openbmb.org/) から公開されました. 1. 
**[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (Salesforce から) Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher から公開された研究論文: [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft から) Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang から公開された研究論文: [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (Facebook から) Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli から公開された研究論文: [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) @@ -335,36 +345,53 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (Berkeley/Facebook/Google から) Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch から公開された研究論文: [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (SenseTime Research から) Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai から公開された研究論文: [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (Facebook から) Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou から公開された研究論文: [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) +1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (Google AI から) Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun. から公開された研究論文 [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) +1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (The University of Texas at Austin から) Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl. から公開された研究論文 [NMS Strikes Back](https://arxiv.org/abs/2212.06137) 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (Facebook から) Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko から公開された研究論文: [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (Microsoft Research から) Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan から公開された研究論文: [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) 1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (SHI Labs から) Ali Hassani and Humphrey Shi から公開された研究論文: [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) +1. 
**[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (Meta AI から) Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski. から公開された研究論文 [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (HuggingFace から), Victor Sanh, Lysandre Debut and Thomas Wolf. 同じ手法で GPT2, RoBERTa と Multilingual BERT の圧縮を行いました.圧縮されたモデルはそれぞれ [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation)、[DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation)、[DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) と名付けられました. 公開された研究論文: [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (Microsoft Research から) Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei から公開された研究論文: [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (NAVER から), Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park から公開された研究論文: [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (Facebook から) Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih から公開された研究論文: [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (Intel Labs から) René Ranftl, Alexey Bochkovskiy, Vladlen Koltun から公開された研究論文: [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) +1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (Snap Research から) Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. から公開された研究論文 [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) +1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (Google Research/Stanford University から) Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning から公開された研究論文: [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) +1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (Meta AI から) Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi. から公開された研究論文 [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) 1. 
**[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (Google Research から) Sascha Rothe, Shashi Narayan, Aliaksei Severyn から公開された研究論文: [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (Baidu から) Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu から公開された研究論文: [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) -1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (Meta AI から) はトランスフォーマープロテイン言語モデルです. **ESM-1b** は Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus から公開された研究論文: [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118). **ESM-1v** は Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives から公開された研究論文: [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648). **ESM-2** と **ESMFold** は Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives から公開された研究論文: [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) +1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (Baidu から) Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang. から公開された研究論文 [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) +1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (Meta AI から) はトランスフォーマープロテイン言語モデルです. **ESM-1b** は Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus から公開された研究論文: [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118). **ESM-1v** は Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives から公開された研究論文: [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648). **ESM-2** と **ESMFold** は Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives から公開された研究論文: [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) +1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. 1. 
**[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (Google AI から) Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V から公開されたレポジトリー [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) Le, and Jason Wei +1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (CNRS から) Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab から公開された研究論文: [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (Facebook AI から) Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela から公開された研究論文: [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (Google Research から) James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon から公開された研究論文: [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) +1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (Microsoft Research から) Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. から公開された研究論文 [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (CMU/Google Brain から) Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le から公開された研究論文: [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) -1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (Microsoft Research から) Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. から公開された研究論文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) +1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (Microsoft Research から) Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. から公開された研究論文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) 1. 
**[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (KAIST から) Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim から公開された研究論文: [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (OpenAI から) Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever から公開された研究論文: [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (EleutherAI から) Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy から公開されたレポジトリー : [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) 1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (EleutherAI から) Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach から公開された研究論文: [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (ABEJA から) Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori からリリース. 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (OpenAI から) Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** から公開された研究論文: [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) -1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (EleutherAI から) Ben Wang and Aran Komatsuzaki から公開されたレポジトリー [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) -1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (AI-Sweden から) Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren から公開された研究論文: [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) +1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (EleutherAI から) Ben Wang and Aran Komatsuzaki から公開されたレポジトリー [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) +1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (AI-Sweden から) Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren から公開された研究論文: [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) +1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode から) Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. から公開された研究論文 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) +1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) 坂本俊之(tanreinama)からリリースされました. +1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (Microsoft から) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu から公開された研究論文: [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234). 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA から) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang から公開された研究論文: [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) +1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology から) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. から公開された研究論文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook から) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed から公開された研究論文: [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley から) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer から公開された研究論文: [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) +1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (OpenAI から) Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever から公開された研究論文: [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) +1. 
**[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang. +1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (Salesforce から) Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. から公開された研究論文 [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (OpenAI から) Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever から公開された研究論文: [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (Microsoft Research Asia から) Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou から公開された研究論文: [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (Microsoft Research Asia から) Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou から公開された研究論文: [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) @@ -373,6 +400,8 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (AllenAI から) Iz Beltagy, Matthew E. Peters, Arman Cohan から公開された研究論文: [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (Meta AI から) Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze から公開された研究論文: [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (South China University of Technology から) Jiapeng Wang, Lianwen Jin, Kai Ding から公開された研究論文: [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) +1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (The FAIR team of Meta AI から) Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. から公開された研究論文 [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) +1. 
**[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (The FAIR team of Meta AI から) Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushka rMishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing EllenTan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.. から公開された研究論文 [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/XXX) 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (AllenAI から) Iz Beltagy, Matthew E. Peters, Arman Cohan から公開された研究論文: [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (Google AI から) Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang から公開された研究論文: [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (Studio Ousia から) Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto から公開された研究論文: [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) @@ -381,32 +410,50 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (Facebook から) Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin から公開された研究論文: [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Jörg Tiedemann から. [OPUS](http://opus.nlpl.eu/) を使いながら学習された "Machine translation" (マシントランスレーション) モデル. [Marian Framework](https://marian-nmt.github.io/) はMicrosoft Translator Team が現在開発中です. 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (Microsoft Research Asia から) Junlong Li, Yiheng Xu, Lei Cui, Furu Wei から公開された研究論文: [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) +1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (FAIR and UIUC から) Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. から公開された研究論文 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) 1. 
**[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (Meta and UIUC から) Bowen Cheng, Alexander G. Schwing, Alexander Kirillov から公開された研究論文: [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) +1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (Google AI から) Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos. から公開された研究論文 [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook から) Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer から公開された研究論文: [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook から) Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan から公開された研究論文: [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) +1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (Facebook から) Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. から公開された研究論文 [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (NVIDIA から) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro から公開された研究論文: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (NVIDIA から) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro から公開された研究論文: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) +1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (Alibaba Research から) Peng Wang, Cheng Da, and Cong Yao. から公開された研究論文 [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) +1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.. 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (Studio Ousia から) Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka から公開された研究論文: [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) +1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (Facebook から) Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli. 
から公開された研究論文 [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (CMU/Google Brain から) Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou から公開された研究論文: [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (Google Inc. から) Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam から公開された研究論文: [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (Google Inc. から) Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen から公開された研究論文: [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (Apple から) Sachin Mehta and Mohammad Rastegari から公開された研究論文: [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) +1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (Apple から) Sachin Mehta and Mohammad Rastegari. から公開された研究論文 [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (Microsoft Research から) Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu から公開された研究論文: [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) +1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (MosaiML から) the MosaicML NLP Team. から公開された研究論文 [llm-foundry](https://github.com/mosaicml/llm-foundry/) +1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (the University of Wisconsin - Madison から) Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh. から公開された研究論文 [Multi Resolution Analysis (MRA)](https://arxiv.org/abs/2207.10284) 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (Google AI から) Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel から公開された研究論文: [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) +1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez. 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (RUC AI Box から) Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen から公開された研究論文: [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) 1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (SHI Labs から) Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi から公開された研究論文: [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) 1. 
**[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (Huawei Noah’s Ark Lab から) Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu から公開された研究論文: [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (Meta から) the NLLB team から公開された研究論文: [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) +1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (Meta から) the NLLB team. から公開された研究論文 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) +1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (Meta AI から) Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic. から公開された研究論文 [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (the University of Wisconsin - Madison から) Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh から公開された研究論文: [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) +1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (SHI Labs から) Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi から公開された研究論文: [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) +1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI から) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al から公開された研究論文: [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI から) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby から公開された研究論文: [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google から) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu から公開された研究論文: [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google から) Jason Phang, Yao Zhao, and Peter J. Liu から公開された研究論文: [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind から) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. 
Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira から公開された研究論文: [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) +1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (ADEPT から) Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani. から公開された研究論文 [blog post](https://www.adept.ai/blog/persimmon-8b) 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (VinAI Research から) Dat Quoc Nguyen and Anh Tuan Nguyen から公開された研究論文: [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) +1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (Google から) Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. から公開された研究論文 [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (UCLA NLP から) Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang から公開された研究論文: [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (Sea AI Labs から) Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng から公開された研究論文: [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) +1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (Microsoft Research から) Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou から公開された研究論文: [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) +1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (Nanjing University, The University of Hong Kong etc. から) Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. から公開された研究論文 [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (NVIDIA から) Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius から公開された研究論文: [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (Facebook から) Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela から公開された研究論文: [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) 1. 
**[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (Google Research から) Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang から公開された研究論文: [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) @@ -415,52 +462,65 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (Google Research から) Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder から公開された研究論文: [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (Microsoft Research から) Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun から公開された研究論文: [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (Facebook から), Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov から公開された研究論文: [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) -1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/model_doc/roberta-prelayernorm)** (Facebook から) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli から公開された研究論文: [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) -1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (WeChatAI から) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou から公開された研究論文: [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) +1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (Facebook から) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli から公開された研究論文: [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) +1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (WeChatAI から) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou から公開された研究論文: [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (ZhuiyiTechnology から), Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu から公開された研究論文: [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) +1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (Bo Peng から) Bo Peng. から公開された研究論文 [this repo](https://github.com/BlinkDL/RWKV-LM) 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA から) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo から公開された研究論文: [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) +1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (Meta AI から) Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. から公開された研究論文 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) 1. 
**[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP から) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi から公開された研究論文: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (ASAPP から) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi から公開された研究論文: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) +1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (Microsoft Research から) Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei. から公開された研究論文 [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) 1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (Facebook から), Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino から公開された研究論文: [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (Facebook から), Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau から公開された研究論文: [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (Tel Aviv University から), Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy から公開された研究論文: [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (Berkeley から) Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer から公開された研究論文: [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) +1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (MBZUAI から) Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. から公開された研究論文 [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (Microsoft から) Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo から公開された研究論文: [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (Microsoft から) Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo から公開された研究論文: [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) -1. **[Swin2SR](https://huggingface.co/docs/transformers/main/model_doc/swin2sr)** (University of Würzburg から) Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte から公開された研究論文: [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) -1. 
**[SwitchTransformers](https://huggingface.co/docs/transformers/main/model_doc/switch_transformers)** (Google から) William Fedus, Barret Zoph, Noam Shazeer から公開された研究論文: [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) +1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (University of Würzburg から) Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte から公開された研究論文: [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) +1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (Google から) William Fedus, Barret Zoph, Noam Shazeer から公開された研究論文: [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (Google AI から) Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu から公開された研究論文: [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (Google AI から) Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu から公開されたレポジトリー [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (Microsoft Research から) Brandon Smock, Rohith Pesala, Robin Abraham から公開された研究論文: [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (Google AI から) Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos から公開された研究論文: [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (Microsoft Research から) Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou から公開された研究論文: [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (HuggingFace から). -1. **[TimeSformer](https://huggingface.co/docs/transformers/main/model_doc/timesformer)** (Facebook から) Gedas Bertasius, Heng Wang, Lorenzo Torresani から公開された研究論文: [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) +1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (Facebook から) Gedas Bertasius, Heng Wang, Lorenzo Torresani から公開された研究論文: [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (the University of California at Berkeley から) Michael Janner, Qiyang Li, Sergey Levine から公開された研究論文: [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) 1. 
**[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU から) Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov から公開された研究論文: [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (Microsoft から), Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei から公開された研究論文: [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) +1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill から), Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal から公開された研究論文: [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (Google Research から) Yi Tay, Mostafa Dehghani, Vinh Q から公開された研究論文: [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler +1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (Google Research から) Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant. から公開された研究論文 [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (Microsoft Research から) Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang から公開された研究論文: [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (Microsoft Research から) Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu から公開された研究論文: [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) +1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (Peking University から) Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun. から公開された研究論文 [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (Tsinghua University and Nankai University から) Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu から公開された研究論文: [Visual Attention Network](https://arxiv.org/abs/2202.09741) 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (Multimedia Computing Group, Nanjing University から) Zhan Tong, Yibing Song, Jue Wang, Limin Wang から公開された研究論文: [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (NAVER AI Lab/Kakao Enterprise/Kakao Brain から) Wonjae Kim, Bokyung Son, Ildoo Kim から公開された研究論文: [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) 1. 
**[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (Google AI から) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby から公開された研究論文: [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (UCLA NLP から) Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang から公開された研究論文: [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) -1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (Google AI から) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby から公開された研究論文: [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) +1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (Google AI から) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby から公開された研究論文: [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) +1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (Meta AI から) Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He. から公開された研究論文 [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (Meta AI から) Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick から公開された研究論文: [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) +1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (HUST-VL から) Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. から公開された研究論文 [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (Meta AI から) Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas から公開された研究論文: [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) +1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (Kakao Enterprise から) Jaehyeon Kim, Jungil Kong, Juhee Son. から公開された研究論文 [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) +1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (Facebook AI から) Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli から公開された研究論文: [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 1. 
**[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (Facebook AI から) Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino から公開された研究論文: [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (Facebook AI から) Qiantong Xu, Alexei Baevski, Michael Auli から公開された研究論文: [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (Microsoft Research から) Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei から公開された研究論文: [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) 1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (OpenAI から) Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever から公開された研究論文: [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (Microsoft Research から) Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling から公開された研究論文: [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) +1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (Meta AI から) Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe. から公開された研究論文 [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li から公開された研究論文: [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (Facebook から) Guillaume Lample and Alexis Conneau から公開された研究論文: [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (Microsoft Research から) Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou から公開された研究論文: [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (Facebook AI から), Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov から公開された研究論文: [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) 1. 
**[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (Facebook AI から), Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau から公開された研究論文: [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) +1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (Meta AI から) Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa から公開された研究論文: [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (Google/CMU から) Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le から公開された研究論文: [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) 1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (Facebook AI から) Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli から公開された研究論文: [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) 1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (Facebook AI から) Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli から公開された研究論文: [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) diff --git a/README_ko.md b/README_ko.md index 8d0443fd6f50..35fc4e7f453f 100644 --- a/README_ko.md +++ b/README_ko.md @@ -18,7 +18,7 @@ limitations under the License.

@@ -47,7 +47,7 @@ limitations under the License.
Español | 日本語 | हिन्दी
@@ -175,7 +175,7 @@ limitations under the License. ### pip로 설치하기 -이 저장소는 Python 3.6+, Flax 0.3.2+, PyTorch 1.3.1+, TensorFlow 2.3+에서 테스트 되었습니다. +이 저장소는 Python 3.8+, Flax 0.4.1+, PyTorch 1.10+, TensorFlow 2.6+에서 테스트 되었습니다. [가상 환경](https://docs.python.org/3/library/venv.html)에 🤗 Transformers를 설치하세요. Python 가상 환경에 익숙하지 않다면, [사용자 가이드](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)를 확인하세요. @@ -213,8 +213,11 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 🤗 Transformers는 다음 모델들을 제공합니다 (각 모델의 요약은 [여기](https://huggingface.co/docs/transformers/model_summary)서 확인하세요): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[AltCLIP](https://huggingface.co/docs/transformers/main/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. +1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (Google Research 에서 제공)은 Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.의 [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918)논문과 함께 발표했습니다. +1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. +1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long. +1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team. 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. 
**[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. @@ -224,24 +227,31 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. +1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. 
**[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (Salesforce 에서 제공)은 Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.의 [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597)논문과 함께 발표했습니다. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (Alexa 에서) Adrian de Wynter and Daniel J. Perry 의 [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) 논문과 함께 발표했습니다. +1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. +1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (NAVER CLOVA 에서 제공)은 Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park.의 [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539)논문과 함께 발표했습니다. 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (Google Research 에서) Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel 의 [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) 논문과 함께 발표했습니다. 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne 에서) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 의 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 논문과 함께 발표했습니다. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research 에서) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 의 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 논문과 함께 발표했습니다. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys 에서) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 의 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 논문과 함께 발표했습니다. +1. 
**[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (LAION-AI 에서 제공)은 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.의 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687)논문과 함께 발표했습니다. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI 에서) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 의 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 논문과 함께 발표했습니다. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen 에서) Timo Lüddecke and Alexander Ecker 의 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 논문과 함께 발표했습니다. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce 에서) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 의 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 논문과 함께 발표했습니다. +1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (MetaAI 에서 제공)은 Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.의 [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/)논문과 함께 발표했습니다. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (Microsoft Research Asia 에서) Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang 의 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 논문과 함께 발표했습니다. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (YituTech 에서) Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 의 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 논문과 함께 발표했습니다. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (Facebook AI 에서) Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 의 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 논문과 함께 발표했습니다. +1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. 1. 
**[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (Tsinghua University 에서) Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 의 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 논문과 함께 발표했습니다. +1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/). 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (Salesforce 에서) Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher 의 [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 논문과 함께 발표했습니다. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft 에서) Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang 의 [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) 논문과 함께 발표했습니다. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (Facebook 에서) Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli 의 [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) 논문과 함께 발표했습니다. @@ -250,36 +260,53 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (Berkeley/Facebook/Google 에서) Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch 의 [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) 논문과 함께 발표했습니다. 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (SenseTime Research 에서) Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai 의 [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) 논문과 함께 발표했습니다. 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (Facebook 에서) Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou 의 [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) 논문과 함께 발표했습니다. +1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (Google AI 에서 제공)은 Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.의 [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505)논문과 함께 발표했습니다. +1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (The University of Texas at Austin 에서 제공)은 Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.의 [NMS Strikes Back](https://arxiv.org/abs/2212.06137)논문과 함께 발표했습니다. 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (Facebook 에서) Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko 의 [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) 논문과 함께 발표했습니다. 1. 
**[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (Microsoft Research 에서) Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan 의 [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) 논문과 함께 발표했습니다. 1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (SHI Labs 에서) Ali Hassani and Humphrey Shi 의 [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) 논문과 함께 발표했습니다. +1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (Meta AI 에서 제공)은 Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.의 [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193)논문과 함께 발표했습니다. 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (HuggingFace 에서) Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) and a German version of DistilBERT 의 [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) 논문과 함께 발표했습니다. 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (Microsoft Research 에서) Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei 의 [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) 논문과 함께 발표했습니다. 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (NAVER 에서) Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park 의 [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) 논문과 함께 발표했습니다. 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (Facebook 에서) Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 의 [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 논문과 함께 발표했습니다. 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (Intel Labs 에서) René Ranftl, Alexey Bochkovskiy, Vladlen Koltun 의 [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) 논문과 함께 발표했습니다. +1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. +1. 
**[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (Google Research/Stanford University 에서) Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning 의 [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) 논문과 함께 발표했습니다. +1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (Meta AI 에서 제공)은 Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi.의 [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438)논문과 함께 발표했습니다. 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (Google Research 에서) Sascha Rothe, Shashi Narayan, Aliaksei Severyn 의 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 논문과 함께 발표했습니다. 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (Baidu 에서) Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu 의 [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) 논문과 함께 발표했습니다. +1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (Baidu 에서 제공)은 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.의 [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674)논문과 함께 발표했습니다. 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. +1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. 1. 
**[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. +1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. +1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 1. 
**[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. 1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (EleutherAI 에서) Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbac 의 [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) 논문과 함께 발표했습니다. 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori. 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (OpenAI 에서) Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** 의 [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 논문과 함께 발표했습니다. -1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. -1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (AI-Sweden 에서) Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. 의 [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) 논문과 함께 발표했습니다. +1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. +1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (AI-Sweden 에서) Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. 의 [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) 논문과 함께 발표했습니다. +1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode 에서 제공)은 Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.의 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988)논문과 함께 발표했습니다. +1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). +1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu 의 [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) 논문과 함께 발표했습니다. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA 에서) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 의 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 논문과 함께 발표했습니다. +1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology 에서 제공)은 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.의 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf)논문과 함께 발표했습니다. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook 에서) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 의 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 논문과 함께 발표했습니다. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley 에서) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 의 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 논문과 함께 발표했습니다. +1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (OpenAI 에서) Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever 의 [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) 논문과 함께 발표했습니다. +1. 
**[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang. +1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (Salesforce 에서 제공)은 Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.의 [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500)논문과 함께 발표했습니다. 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (OpenAI 에서) Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever 의 [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) 논문과 함께 발표했습니다. 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (Microsoft Research Asia 에서) Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou 의 [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) 논문과 함께 발표했습니다. 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (Microsoft Research Asia 에서) Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou 의 [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) 논문과 함께 발표했습니다. @@ -288,6 +315,8 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (AllenAI 에서) Iz Beltagy, Matthew E. Peters, Arman Cohan 의 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 논문과 함께 발표했습니다. 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (Meta AI 에서) Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze 의 [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) 논문과 함께 발표했습니다. 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (South China University of Technology 에서) Jiapeng Wang, Lianwen Jin, Kai Ding 의 [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) 논문과 함께 발표했습니다. +1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (The FAIR team of Meta AI 에서 제공)은 Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.의 [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)논문과 함께 발표했습니다. +1. 
**[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (The FAIR team of Meta AI 에서 제공)은 Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushka rMishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing EllenTan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom..의 [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/XXX)논문과 함께 발표했습니다. 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (AllenAI 에서) Iz Beltagy, Matthew E. Peters, Arman Cohan 의 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 논문과 함께 발표했습니다. 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (Google AI 에서) Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang 의 [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) 논문과 함께 발표했습니다. 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (Studio Ousia 에서) Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto 의 [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) 논문과 함께 발표했습니다. @@ -296,32 +325,50 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (Facebook 에서) Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin 의 [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) 논문과 함께 발표했습니다. 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (Microsoft Research Asia 에서) Junlong Li, Yiheng Xu, Lei Cui, Furu Wei 의 [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) 논문과 함께 발표했습니다. +1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (FAIR and UIUC 에서 제공)은 Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.의 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527)논문과 함께 발표했습니다. 1. 
**[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (Meta and UIUC 에서) Bowen Cheng, Alexander G. Schwing, Alexander Kirillov 의 [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) 논문과 함께 발표했습니다. +1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (Google AI 에서 제공)은 Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.의 [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662)논문과 함께 발표했습니다. 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook 에서) Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer 의 [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) 논문과 함께 발표했습니다. 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook 에서) Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan 의 [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) 논문과 함께 발표했습니다. +1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (Facebook 에서 제공)은 Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.의 [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655)논문과 함께 발표했습니다. 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (NVIDIA 에서) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 의 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 논문과 함께 발표했습니다. 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (NVIDIA 에서) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 의 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 논문과 함께 발표했습니다. +1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (Alibaba Research 에서 제공)은 Peng Wang, Cheng Da, and Cong Yao.의 [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592)논문과 함께 발표했습니다. +1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.. 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (Studio Ousia 에서) Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka 의 [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) 논문과 함께 발표했습니다. +1. 
**[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (Facebook 에서 제공)은 Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.의 [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516)논문과 함께 발표했습니다. 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (CMU/Google Brain 에서) Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou 의 [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) 논문과 함께 발표했습니다. 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (Google Inc. 에서) Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam 의 [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) 논문과 함께 발표했습니다. 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (Google Inc. 에서) Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen 의 [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) 논문과 함께 발표했습니다. 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (Apple 에서) Sachin Mehta and Mohammad Rastegari 의 [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) 논문과 함께 발표했습니다. +1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (Apple 에서 제공)은 Sachin Mehta and Mohammad Rastegari.의 [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680)논문과 함께 발표했습니다. 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (Microsoft Research 에서) Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu 의 [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) 논문과 함께 발표했습니다. +1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (MosaiML 에서 제공)은 the MosaicML NLP Team.의 [llm-foundry](https://github.com/mosaicml/llm-foundry/)논문과 함께 발표했습니다. +1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (the University of Wisconsin - Madison 에서 제공)은 Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh.의 [Multi Resolution Analysis (MRA)](https://arxiv.org/abs/2207.10284) 논문과 함께 발표했습니다. 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (Google AI 에서) Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel 의 [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 논문과 함께 발표했습니다. +1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez. 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (RUC AI Box 에서) Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen 의 [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) 논문과 함께 발표했습니다. 1. 
**[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (SHI Labs 에서) Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi 의 [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) 논문과 함께 발표했습니다. 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (Huawei Noah’s Ark Lab 에서) Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu 의 [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) 논문과 함께 발표했습니다. 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (Meta 에서) the NLLB team 의 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) 논문과 함께 발표했습니다. +1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (Meta 에서 제공)은 the NLLB team.의 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672)논문과 함께 발표했습니다. +1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (Meta AI 에서 제공)은 Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.의 [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418)논문과 함께 발표했습니다. 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (the University of Wisconsin - Madison 에서) Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh 의 [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) 논문과 함께 발표했습니다. +1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (SHI Labs 에서) Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi 의 [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) 논문과 함께 발표했습니다. +1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI 에서) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 의 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 논문과 함께 발표했습니다. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI 에서) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 의 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 논문과 함께 발표했습니다. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google 에서) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 의 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 논문과 함께 발표했습니다. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google 에서) Jason Phang, Yao Zhao, Peter J. Liu 의 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 논문과 함께 발표했습니다. 1. 
**[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind 에서) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 의 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 논문과 함께 발표했습니다. +1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (ADEPT 에서 제공)은 Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.의 [blog post](https://www.adept.ai/blog/persimmon-8b)논문과 함께 발표했습니다. 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (VinAI Research 에서) Dat Quoc Nguyen and Anh Tuan Nguyen 의 [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) 논문과 함께 발표했습니다. +1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (Google 에서 제공)은 Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.의 [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347)논문과 함께 발표했습니다. 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (UCLA NLP 에서) Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang 의 [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) 논문과 함께 발표했습니다. 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (Sea AI Labs 에서) Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng 의 [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) 논문과 함께 발표했습니다. +1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (Microsoft Research 에서) Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 의 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 논문과 함께 발표했습니다. +1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (Nanjing University, The University of Hong Kong etc. 에서 제공)은 Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.의 [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf)논문과 함께 발표했습니다. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (NVIDIA 에서) Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius 의 [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) 논문과 함께 발표했습니다. 1. 
**[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (Facebook 에서) Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela 의 [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) 논문과 함께 발표했습니다. 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (Google Research 에서) Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang 의 [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) 논문과 함께 발표했습니다. @@ -330,57 +377,70 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (Google Research 에서) Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 의 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 논문과 함께 발표했습니다. 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (Microsoft Research 에서) Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 의 [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) 논문과 함께 발표했습니다. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (Facebook 에서) Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 의 a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 논문과 함께 발표했습니다. -1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/model_doc/roberta-prelayernorm)** (Facebook 에서) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli 의 [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 논문과 함께 발표했습니다. -1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (WeChatAI 에서) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 의 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 논문과 함께 발표했습니다. +1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (Facebook 에서) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli 의 [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 논문과 함께 발표했습니다. +1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (WeChatAI 에서) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 의 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 논문과 함께 발표했습니다. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (ZhuiyiTechnology 에서) Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 의 a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 논문과 함께 발표했습니다. +1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (Bo Peng 에서 제공)은 Bo Peng.의 [this repo](https://github.com/BlinkDL/RWKV-LM)논문과 함께 발표했습니다. 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA 에서) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. 
Alvarez, Ping Luo 의 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 논문과 함께 발표했습니다. +1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (Meta AI 에서 제공)은 Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.의 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf)논문과 함께 발표했습니다. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP 에서) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 의 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 논문과 함께 발표했습니다. 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (ASAPP 에서) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 의 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 논문과 함께 발표했습니다. +1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (Microsoft Research 에서 제공)은 Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.의 [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205)논문과 함께 발표했습니다. 1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (Facebook 에서) Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino 의 [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) 논문과 함께 발표했습니다. 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (Facebook 에서) Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau 의 [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) 논문과 함께 발표했습니다. 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (Tel Aviv University 에서) Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy 의 [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) 논문과 함께 발표했습니다. 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (Berkeley 에서) Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 의 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 논문과 함께 발표했습니다. +1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (MBZUAI 에서 제공)은 Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.의 [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446)논문과 함께 발표했습니다. 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (Microsoft 에서) Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo 의 [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) 논문과 함께 발표했습니다. 1. 
**[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (Microsoft 에서) Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo 의 [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) 논문과 함께 발표했습니다. -1. **[Swin2SR](https://huggingface.co/docs/transformers/main/model_doc/swin2sr)** (University of Würzburg 에서) Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte 의 [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) 논문과 함께 발표했습니다. -1. **[SwitchTransformers](https://huggingface.co/docs/transformers/main/model_doc/switch_transformers)** (Google 에서) William Fedus, Barret Zoph, Noam Shazeer. 의 [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) 논문과 함께 발표했습니다. +1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (University of Würzburg 에서) Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte 의 [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) 논문과 함께 발표했습니다. +1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (Google 에서) William Fedus, Barret Zoph, Noam Shazeer. 의 [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) 논문과 함께 발표했습니다. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (Google AI 에서) Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 의 [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 논문과 함께 발표했습니다. 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (Microsoft Research 에서) Brandon Smock, Rohith Pesala, Robin Abraham 의 [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) 논문과 함께 발표했습니다. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (Google AI 에서) Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 의 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 논문과 함께 발표했습니다. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (Microsoft Research 에서) Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou 의 [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 논문과 함께 발표했습니다. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). -1. 
**[TimeSformer](https://huggingface.co/docs/transformers/main/model_doc/timesformer)** (Facebook 에서) Gedas Bertasius, Heng Wang, Lorenzo Torresani 의 [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) 논문과 함께 발표했습니다. +1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (Facebook 에서) Gedas Bertasius, Heng Wang, Lorenzo Torresani 의 [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) 논문과 함께 발표했습니다. 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (the University of California at Berkeley 에서) Michael Janner, Qiyang Li, Sergey Levin 의 [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) 논문과 함께 발표했습니다. 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU 에서) Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 의 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 논문과 함께 발표했습니다. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (Microsoft 에서) Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 의 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 논문과 함께 발표했습니다. +1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill 에서) Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal 의 [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) 논문과 함께 발표했습니다. 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (Google Research 에서) Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzle 의 [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) 논문과 함께 발표했습니다. +1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (Google Research 에서 제공)은 Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant.의 [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi)논문과 함께 발표했습니다. 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (Microsoft Research 에서) Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 의 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 논문과 함께 발표했습니다. 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (Microsoft Research 에서) Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu 의 [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) 논문과 함께 발표했습니다. +1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (Peking University 에서 제공)은 Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.의 [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221)논문과 함께 발표했습니다. 1. 
**[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (Tsinghua University and Nankai University 에서) Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu 의 [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) 논문과 함께 발표했습니다. 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (Multimedia Computing Group, Nanjing University 에서) Zhan Tong, Yibing Song, Jue Wang, Limin Wang 의 [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) 논문과 함께 발표했습니다. 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (NAVER AI Lab/Kakao Enterprise/Kakao Brain 에서) Wonjae Kim, Bokyung Son, Ildoo Kim 의 [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) 논문과 함께 발표했습니다. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (Google AI 에서) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 의 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 논문과 함께 발표했습니다. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (UCLA NLP 에서) Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang 의 [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) 논문과 함께 발표했습니다. -1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (Google AI 에서) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 의 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 논문과 함께 발표했습니다. +1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (Google AI 에서) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 의 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 논문과 함께 발표했습니다. +1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (Meta AI 에서 제공)은 Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.의 [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527)논문과 함께 발표했습니다. 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (Meta AI 에서) Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick 의 [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) 논문과 함께 발표했습니다. +1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (HUST-VL 에서 제공)은 Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.의 [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272)논문과 함께 발표했습니다. 1. 
**[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (Meta AI 에서) Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas 의 [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) 논문과 함께 발표했습니다. +1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (Kakao Enterprise 에서 제공)은 Jaehyeon Kim, Jungil Kong, Juhee Son.의 [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103)논문과 함께 발표했습니다. +1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (Facebook AI 에서) Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli 의 [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 논문과 함께 발표했습니다. 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (Facebook AI 에서) Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino 의 [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) 논문과 함께 발표했습니다. 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (Facebook AI 에서) Qiantong Xu, Alexei Baevski, Michael Auli 의 [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) 논문과 함께 발표했습니다. 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (Microsoft Research 에서) Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei 의 [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) 논문과 함께 발표했습니다. 1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (OpenAI 에서) Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever 의 [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) 논문과 함께 발표했습니다. 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (Microsoft Research 에서) Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling 의 [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) 논문과 함께 발표했습니다. +1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (Meta AI 에서 제공)은 Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe.의 [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255)논문과 함께 발표했습니다. 1. 
**[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (Facebook AI 에서 제공) Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li 의 [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) 논문과 함께 발표했습니다. 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (Facebook 에서) Guillaume Lample and Alexis Conneau 의 [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) 논문과 함께 발표했습니다. 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (Microsoft Research 에서) Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 의 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 논문과 함께 발표했습니다. 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (Facebook AI 에서) Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov 의 [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) 논문과 함께 발표했습니다. 1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (Facebook AI 에서) Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau 의 [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) 논문과 함께 발표했습니다. +1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (Meta AI 에서) Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa 의 [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) 논문과 함께 발표했습니다. 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (Google/CMU 에서) Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le 의 [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) 논문과 함께 발표했습니다. 1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (Facebook AI 에서) Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli 의 [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) 논문과 함께 발표했습니다. 1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (Facebook AI 에서) Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli 의 [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) 논문과 함께 발표했습니다. 1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (Huazhong University of Science & Technology 에서) Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu 의 [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) 논문과 함께 발표했습니다. -1. 
**[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (the University of Wisconsin - Madison 에서) Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh 의 [You Only Sample (Almost) 논문과 함께 발표했습니다. +1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (the University of Wisconsin - Madison 에서) Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh 의 [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) 논문과 함께 발표했습니다. 1. 새로운 모델을 올리고 싶나요? 우리가 **상세한 가이드와 템플릿** 으로 새로운 모델을 올리도록 도와드릴게요. 가이드와 템플릿은 이 저장소의 [`templates`](./templates) 폴더에서 확인하실 수 있습니다. [컨트리뷰션 가이드라인](./CONTRIBUTING.md)을 꼭 확인해주시고, PR을 올리기 전에 메인테이너에게 연락하거나 이슈를 오픈해 피드백을 받으시길 바랍니다. 각 모델이 Flax, PyTorch, TensorFlow으로 구현되었는지 또는 🤗 Tokenizers 라이브러리가 지원하는 토크나이저를 사용하는지 확인하려면, [이 표](https://huggingface.co/docs/transformers/index#supported-frameworks)를 확인하세요. diff --git a/README_ru.md b/README_ru.md new file mode 100644 index 000000000000..8a15bf871ea7 --- /dev/null +++ b/README_ru.md @@ -0,0 +1,549 @@ + + +

+<!-- Логотип: Hugging Face Transformers Library -->
+
+<!-- Значки: Build | GitHub | Documentation | GitHub release | Contributor Covenant | DOI -->
+
+<p align="center">
+    English | 简体中文 | 繁體中文 | 한국어 | Español | 日本語 | हिन्दी | Русский
+</p>
+
+<p align="center">
+    <b>Современное машинное обучение для JAX, PyTorch и TensorFlow</b>
+</p>
+
+🤗 Transformers предоставляет тысячи предварительно обученных моделей для выполнения различных задач в таких областях, как текст, зрение и аудио.
+
+Эти модели могут быть применены в следующих областях:
+
+* 📝 Текст - для таких задач, как классификация текстов, извлечение информации, ответы на вопросы, обобщение, перевод, генерация текстов, на более чем 100 языках.
+* 🖼️ Изображения - для задач классификации изображений, обнаружения объектов и сегментации.
+* 🗣️ Аудио - для задач распознавания речи и классификации аудио.
+
+Модели transformers также могут решать задачи, объединяющие несколько модальностей, такие как ответы на вопросы по таблицам, распознавание оптических символов, извлечение информации из отсканированных документов, классификация видео и ответы на визуальные вопросы.
+
+🤗 Transformers предоставляет API для быстрой загрузки и использования предварительно обученных моделей, их тонкой настройки на собственных датасетах и последующего обмена ими с сообществом на нашем [сайте](https://huggingface.co/models). В то же время каждый Python-модуль, определяющий архитектуру, полностью автономен и может быть модифицирован для проведения быстрых исследовательских экспериментов.
+
+🤗 Transformers опирается на три самые популярные библиотеки глубокого обучения - [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) и [TensorFlow](https://www.tensorflow.org/) - и легко интегрируется с ними. Это позволяет обучать модели с помощью одной из них, а затем загружать их для выводов с помощью другой.
+
+## Онлайн демонстрация
+
+Большинство наших моделей можно протестировать непосредственно на их страницах на [сайте](https://huggingface.co/models). Мы также предлагаем [приватный хостинг моделей, контроль версий и API для выводов](https://huggingface.co/pricing) для публичных и частных моделей.
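+К API для выводов можно обращаться и обычным HTTP-запросом, без установки библиотеки. Ниже - примерный набросок такого запроса (адрес конечной точки и имя контрольной точки приведены как предположение о типовой конфигурации размещённого Inference API; токен доступа - заглушка):
+
+```python
+>>> import requests
+
+# Адрес размещённого Inference API для конкретной модели (предположение о типовой конечной точке)
+>>> API_URL = "https://api-inference.huggingface.co/models/distilbert-base-uncased-finetuned-sst-2-english"
+# hf_xxx - заглушка: подставьте свой токен доступа с https://huggingface.co/settings/tokens
+>>> headers = {"Authorization": "Bearer hf_xxx"}
+
+>>> response = requests.post(API_URL, headers=headers, json={"inputs": "We are very happy to show you the 🤗 Transformers library."})
+>>> response.json()  # список меток тональности с оценками уверенности
+```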
+ +Вот несколько примеров: + +В области NLP ( Обработка текстов на естественном языке ): +- [Маскированное заполнение слов с помощью BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) +- [Распознавание сущностей с помощью Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city) +- [Генерация текста с помощью GPT-2](https://huggingface.co/gpt2?text=A+long+time+ago%2C+) +- [Выводы на естественном языке с помощью RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) +- [Обобщение с помощью BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct) +- [Ответы на вопросы с помощью DistilBERT](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) +- [Перевод с помощью T5](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) + +В области компьютерного зрения: +- [Классификация изображений с помощью ViT](https://huggingface.co/google/vit-base-patch16-224) +- [Обнаружение объектов с помощью DETR](https://huggingface.co/facebook/detr-resnet-50) +- [Семантическая сегментация с помощью SegFormer](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512) +- [Сегментация паноптикума с помощью MaskFormer](https://huggingface.co/facebook/maskformer-swin-small-coco) +- [Оценка глубины с помощью DPT](https://huggingface.co/docs/transformers/model_doc/dpt) +- [Классификация видео с помощью 
VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)
+- [Универсальная сегментация с помощью OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_dinat_large)
+
+В области звука:
+- [Автоматическое распознавание речи с помощью Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base-960h)
+- [Поиск ключевых слов с помощью Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks)
+- [Классификация аудиоданных с помощью трансформера аудиоспектрограмм](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593)
+
+В мультимодальных задачах:
+- [Ответы на вопросы по таблице с помощью TAPAS](https://huggingface.co/google/tapas-base-finetuned-wtq)
+- [Визуальные ответы на вопросы с помощью ViLT](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa)
+- [Zero-shot классификация изображений с помощью CLIP](https://huggingface.co/openai/clip-vit-large-patch14)
+- [Ответы на вопросы по документам с помощью LayoutLM](https://huggingface.co/impira/layoutlm-document-qa)
+- [Zero-shot классификация видео с помощью X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)
+
+## 100 проектов, использующих Transformers
+
+Transformers - это не просто набор инструментов для использования предварительно обученных моделей: это сообщество проектов, созданных на его основе, и Hugging Face Hub. Мы хотим, чтобы Transformers позволил разработчикам, исследователям, студентам, преподавателям, инженерам и всем желающим создавать проекты своей мечты.
+
+Чтобы отпраздновать 100 тысяч звезд Transformers, мы решили сделать акцент на сообществе и создали страницу [awesome-transformers](./awesome-transformers.md), на которой перечислены 100 невероятных проектов, созданных с помощью transformers.
+
+Если вы являетесь владельцем или пользователем проекта, который, по вашему мнению, должен быть включен в этот список, пожалуйста, откройте PR для его добавления!
+
+## Если вы хотите получить индивидуальную поддержку от команды Hugging Face
+
+<!-- Баннер: HuggingFace Expert Acceleration Program -->
+
+## Quick tour
+
+To immediately use a model on a given input (text, image, audio, ...), we provide the `pipeline` API. Pipelines group together a pretrained model with the preprocessing that was used during that model's training. Here is how to quickly use a pipeline to classify positive versus negative texts:
+
+```python
+>>> from transformers import pipeline
+
+# Allocate a pipeline for sentiment analysis
+>>> classifier = pipeline('sentiment-analysis')
+>>> classifier('We are very happy to introduce pipeline to the transformers repository.')
+[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
+```
+
+The second line of code downloads and caches the pretrained model used by the pipeline, while the third evaluates it on the given text. Here, the answer is "POSITIVE" with a confidence of 99.97%.
+
+Many tasks have a ready-made `pipeline`, in NLP as well as in computer vision and speech. For example, we can easily extract the objects detected in an image:
+
+```python
+>>> import requests
+>>> from PIL import Image
+>>> from transformers import pipeline
+
+# Download an image with cute cats
+>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png"
+>>> image_data = requests.get(url, stream=True).raw
+>>> image = Image.open(image_data)
+
+# Allocate a pipeline for object detection
+>>> object_detector = pipeline('object-detection')
+>>> object_detector(image)
+[{'score': 0.9982201457023621,
+  'label': 'remote',
+  'box': {'xmin': 40, 'ymin': 70, 'xmax': 175, 'ymax': 117}},
+ {'score': 0.9960021376609802,
+  'label': 'remote',
+  'box': {'xmin': 333, 'ymin': 72, 'xmax': 368, 'ymax': 187}},
+ {'score': 0.9954745173454285,
+  'label': 'couch',
+  'box': {'xmin': 0, 'ymin': 1, 'xmax': 639, 'ymax': 473}},
+ {'score': 0.9988006353378296,
+  'label': 'cat',
+  'box': {'xmin': 13, 'ymin': 52, 'xmax': 314, 'ymax': 470}},
+ {'score': 0.9986783862113953,
+  'label': 'cat',
+  'box': {'xmin': 345, 'ymin': 23, 'xmax': 640, 'ymax': 368}}]
+```
+
+Here we get a list of objects detected in the image, with a box surrounding each object and a confidence score. The original image is on the left, and the predictions are on the right:
+

+ + +

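+
+In the examples above, the task name alone selects a sensible default model. If you prefer a specific checkpoint from the Hub, `pipeline` also accepts a `model` argument; the minimal sketch below uses one well-known sentiment-analysis checkpoint purely as an illustration, not as the only choice:
+
+```python
+>>> from transformers import pipeline
+
+# Pin an explicit checkpoint instead of relying on the task's default model
+>>> classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
+>>> classifier("We are very happy to introduce pipeline to the transformers repository.")
+```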
+
+You can learn more about the tasks supported by the `pipeline` API in [this tutorial](https://huggingface.co/docs/transformers/task_summary).
+
+In addition to `pipeline`, downloading and using any of the pretrained models on a given task takes just three lines of code. Here is the PyTorch version:
+```python
+>>> from transformers import AutoTokenizer, AutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> model = AutoModel.from_pretrained("bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="pt")
+>>> outputs = model(**inputs)
+```
+
+And here is the equivalent code for TensorFlow:
+```python
+>>> from transformers import AutoTokenizer, TFAutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> model = TFAutoModel.from_pretrained("bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="tf")
+>>> outputs = model(**inputs)
+```
+
+The tokenizer is responsible for all the preprocessing the pretrained model expects and can be called directly on a single string (as in the examples above) or on a list. It outputs a dictionary that you can use in downstream code or simply pass straight to your model using the ** argument-unpacking operator.
+
+The model itself is a regular [PyTorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) or a [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (depending on the backend you use), which you can work with as usual. [This guide](https://huggingface.co/docs/transformers/training) explains how to integrate such a model into a classic PyTorch or TensorFlow training loop, or how to use our `Trainer` API to quickly fine-tune it on a new dataset; two short sketches follow below.
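+
+As a small illustration of calling the tokenizer on a list, the sketch below builds on the PyTorch example above (the two sentences are arbitrary; `padding` and `truncation` are standard tokenizer options):
+
+```python
+>>> from transformers import AutoTokenizer, AutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> model = AutoModel.from_pretrained("bert-base-uncased")
+
+# Tokenizing a list of sentences returns a single padded batch as one dictionary
+>>> batch = tokenizer(["Hello world!", "Transformers provides thousands of pretrained models."], padding=True, truncation=True, return_tensors="pt")
+>>> outputs = model(**batch)  # the dictionary unpacks directly into the forward pass
+```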
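+
+And for the `Trainer` API, a minimal fine-tuning sketch could look like the following; the dataset (`imdb` loaded via 🤗 Datasets), the `text` column name and the hyperparameters are illustrative assumptions rather than the only possible setup:
+
+```python
+from datasets import load_dataset
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    Trainer,
+    TrainingArguments,
+)
+
+# A tiny slice of a public dataset, purely for illustration
+dataset = load_dataset("imdb", split="train[:1%]")
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
+
+# Tokenize the text column so every example has the same fixed length
+def tokenize(batch):
+    return tokenizer(batch["text"], padding="max_length", truncation=True)
+
+tokenized_dataset = dataset.map(tokenize, batched=True)
+
+trainer = Trainer(
+    model=model,
+    args=TrainingArguments(output_dir="test-trainer", num_train_epochs=1, per_device_train_batch_size=8),
+    train_dataset=tokenized_dataset,
+)
+trainer.train()
+```
+
+The training guide linked above walks through the same workflow in more detail, including evaluation and the TensorFlow/Keras path.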
+
+## Why should I use transformers?
+
+1. Easy-to-use state-of-the-art models:
+    - High performance on natural language understanding & generation, computer vision, and audio tasks.
+    - Low barrier to entry for educators and practitioners.
+    - Few user-facing abstractions with just three classes to learn.
+    - A unified API for using all our pretrained models.
+
+1. Lower compute costs, smaller carbon footprint:
+    - Researchers can share trained models instead of always retraining them.
+    - Practitioners can reduce compute time and production costs.
+    - Dozens of architectures with over 60,000 pretrained models across all modalities.
+
+1. Choose the right framework for every part of a model's lifetime:
+    - Train state-of-the-art models in 3 lines of code.
+    - Move a single model between TF2.0/PyTorch/JAX frameworks at will.
+    - Seamlessly pick the right framework for training, evaluation, and production.
+
+1. Easily customize a model or an example to your needs:
+    - We provide examples for each architecture to reproduce the results published by its original authors.
+    - Model internals are exposed as consistently as possible.
+    - Model files can be used independently of the library for quick experiments.
+
+## Why shouldn't I use transformers?
+
+- This library is not a modular toolbox of building blocks for neural nets. The code in the model files is deliberately not refactored with additional abstractions, so that researchers can quickly iterate on each of the models without diving into extra abstractions/files.
+- The training API is not intended to work on any model; it is optimized to work with the models provided by the library. For generic machine learning loops, you should use another library (possibly [Accelerate](https://huggingface.co/docs/accelerate)).
+- While we strive to present as many use cases as possible, the scripts in our [examples folder](https://github.com/huggingface/transformers/tree/main/examples) are just that: examples. They are not expected to work out-of-the-box on your specific problem, and you will need to change a few lines of code to adapt them to your needs.
+
+## Installation
+
+### With pip
+
+This repository is tested on Python 3.8+, Flax 0.4.1+, PyTorch 1.10+ and TensorFlow 2.6+.
+
+You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
+
+First, create a virtual environment with the version of Python you're going to use and activate it.
+
+Then, you will need to install at least one backend out of Flax, PyTorch or TensorFlow.
+Please refer to the [TensorFlow installation page](https://www.tensorflow.org/install/), the [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or the [Flax](https://github.com/google/flax#quick-install) and [Jax](https://github.com/google/jax#installation) installation pages for the specific installation command for your platform.
+
+When one of those backends has been installed, 🤗 Transformers can be installed using pip as follows:
+
+```bash
+pip install transformers
+```
+
+If you'd like to play with the examples or need the bleeding-edge of the code and can't wait for a new release, you must [install the library from source](https://huggingface.co/docs/transformers/installation#installing-from-source).
+
+### With conda
+
+Since Transformers version v4.0.0, we now have a conda channel: `huggingface`.
+
+🤗 Transformers can be installed using conda as follows:
+
+```bash
+conda install -c huggingface transformers
+```
+
+Follow the installation pages of Flax, PyTorch or TensorFlow to see how to install them with conda.
+
+> **_NOTE:_** On Windows, you may be prompted to activate Developer Mode in order to benefit from caching. If this is not an option for you, please let us know in [this issue](https://github.com/huggingface/huggingface_hub/issues/1062).
+
+## Model architectures
+
+**[All the model checkpoints](https://huggingface.co/models)** provided by 🤗 Transformers are seamlessly integrated from the huggingface.co [model hub](https://huggingface.co/models), where they are uploaded directly by [users](https://huggingface.co/users) and [organizations](https://huggingface.co/organizations).
+
+Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
+
+🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them):
+
+1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
+1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.
+1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell.
+1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
+1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long.
+1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team.
+1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
+1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
+1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
+1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
+1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
+1. 
**[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. +1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. +1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. +1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. +1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi. +1. 
**[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). +1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry. +1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. +1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park. +1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. +1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. +1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. +1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. +1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. +1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. +1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. +1. 
**[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. +1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. +1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. +1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. +1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. +1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/). +1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. +1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. +1. 
**[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. +1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. +1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. +1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. +1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai. +1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. +1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun. +1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl. +1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. +1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. +1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi. +1. 
**[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski. +1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. +1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. +1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. +1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. +1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. +1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. +1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le. +1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. +1. 
**[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi. +1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu. +1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang. +1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. +1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. +1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. 
**[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. +1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. +1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. +1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. +1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. +1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. +1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. +1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. +1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. +1. 
**[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach +1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori. +1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. +1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. +1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. +1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). +1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. +1. 
**[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. +1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. +1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. +1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. +1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang. +1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. +1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever. +1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. +1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. +1. 
**[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei. +1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. +1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. +1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze. +1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding. +1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. +1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/XXX) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushka rMishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing EllenTan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom. +1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. 
Peters, Arman Cohan. +1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang. +1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. +1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal. +1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert. +1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. +1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. +1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei. +1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. +1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. +1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos. +1. 
**[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. +1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. +1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Meta/USC/CMU/SJTU) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. +1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao. +1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka. +1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli. +1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou. +1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam. +1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) 
released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen. +1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari. +1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari. +1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. +1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaiML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team. +1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh. +1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. +1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez. +1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. +1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi. +1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. +1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. 
**[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. +1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. +1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). +1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. +1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. +1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. +1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. +1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. +1. **[Persimmon](https://huggingface.co/docs/transformers/main/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani. +1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. +1. 
**[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. +1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. +1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. +1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee. +1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. +1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. +1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. +1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. +1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. +1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. +1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. +1. 
**[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. +1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. +1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. +1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. +1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou. +1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. +1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng. +1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. +1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. +1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1.
**[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei. +1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. +1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. +1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. +1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. +1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. +1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. +1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. +1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. +1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. +1. 
**[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. +1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. +1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. +1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. +1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). +1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. +1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine +1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. +1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. +1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal. +1. 
**[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler +1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant. +1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. +1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. +1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun. +1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. +1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. +1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. +1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. +1. 
**[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[ViTDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He. +1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. +1. **[ViTMatte](https://huggingface.co/docs/transformers/main/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. +1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. +1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. +1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. +1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. +1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. +1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. +1.
**[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. +1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever. +1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling. +1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe. +1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. +1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. +1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. +1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. +1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. +1. 
**[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa. +1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. +1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. +1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. +1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu. +1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. +1. Want to contribute a new model? We have added a **detailed guide and templates** to help you through the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR. + +Чтобы проверить, есть ли у каждой модели реализация на Flax, PyTorch или TensorFlow, или связанный с ней токенизатор, поддерживаемый библиотекой 🤗 Tokenizers, обратитесь к [этой таблице](https://huggingface.co/docs/transformers/index#supported-frameworks). + +Эти реализации были протестированы на нескольких наборах данных (см. примеры скриптов) и должны соответствовать производительности оригинальных реализаций. Более подробную информацию о производительности можно найти в разделе "Примеры" [документации](https://github.com/huggingface/transformers/tree/main/examples).
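As a minimal sketch of the framework and tokenizer support described above (the checkpoint name `bert-base-uncased` is only an illustrative example, and the snippet assumes an environment with both PyTorch and TensorFlow installed), the same pretrained weights can be loaded through either backend:

```python
from transformers import AutoModel, AutoTokenizer, TFAutoModel

checkpoint = "bert-base-uncased"  # illustrative checkpoint; any entry from the supported-frameworks table works

# Fast tokenizer backed by the 🤗 Tokenizers library
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# The same pretrained weights, loaded once as a PyTorch module and once as a TensorFlow/Keras model
pt_model = AutoModel.from_pretrained(checkpoint)
tf_model = TFAutoModel.from_pretrained(checkpoint)

inputs = tokenizer("Hello world!", return_tensors="pt")
print(pt_model(**inputs).last_hidden_state.shape)  # e.g. torch.Size([1, 5, 768])
```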
+ + +## Изучи больше + +| Секция | Описание | +|-|-| +| [Документация](https://huggingface.co/docs/transformers/) | Полная документация по API и гайды | +| [Краткие описания задач](https://huggingface.co/docs/transformers/task_summary) | Задачи поддерживаются 🤗 Transformers | +| [Пособие по предварительной обработке](https://huggingface.co/docs/transformers/preprocessing) | Использование класса `Tokenizer` для подготовки данных для моделей | +| [Обучение и доработка](https://huggingface.co/docs/transformers/training) | Использование моделей, предоставляемых 🤗 Transformers, в цикле обучения PyTorch/TensorFlow и API `Trainer`. | +| [Быстрый тур: Тонкая настройка/скрипты использования](https://github.com/huggingface/transformers/tree/main/examples) | Примеры скриптов для тонкой настройки моделей на широком спектре задач | +| [Совместное использование и загрузка моделей](https://huggingface.co/docs/transformers/model_sharing) | Загружайте и делитесь с сообществом своими доработанными моделями | + +## Цитирование + +Теперь у нас есть [статья](https://www.aclweb.org/anthology/2020.emnlp-demos.6/), которую можно цитировать для библиотеки 🤗 Transformers: +```bibtex +@inproceedings{wolf-etal-2020-transformers, + title = "Transformers: State-of-the-Art Natural Language Processing", + author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush", + booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations", + month = oct, + year = "2020", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6", + pages = "38--45" +} +``` diff --git a/README_zh-hans.md b/README_zh-hans.md index 8a7b507599b3..8c03789b8506 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -26,7 +26,7 @@ token: 词符(并用括号标注原英文) tokenize: 词符化(并用括号标注原英文) tokenizer: 词符化器(并用括号标注原英文) transformer: transformer(不翻译) -pipeline: 流水线 +pipeline: 流水线 API: API (不翻译) inference: 推理 Trainer: 训练器。当作为类名出现时不翻译。 @@ -43,7 +43,7 @@ checkpoint: 检查点

[README_zh-hans.md header hunk: "Build" badge and logo HTML omitted]
@@ -72,7 +72,7 @@ checkpoint: 检查点
[language-selector links hunk (Español | 日本語 | हिन्दी); surrounding HTML omitted]
@@ -83,11 +83,11 @@ checkpoint: 检查点

-🤗 Transformers 提供了数以千计的预训练模型,支持 100 多种语言的文本分类、信息抽取、问答、摘要、翻译、文本生成。它的宗旨让最先进的 NLP 技术人人易用。 +🤗 Transformers 提供了数以千计的预训练模型,支持 100 多种语言的文本分类、信息抽取、问答、摘要、翻译、文本生成。它的宗旨是让最先进的 NLP 技术人人易用。 🤗 Transformers 提供了便于快速下载和使用的API,让你可以把预训练模型用在给定文本、在你的数据集上微调然后通过 [model hub](https://huggingface.co/models) 与社区共享。同时,每个定义的 Python 模块均完全独立,方便修改和快速研究实验。 -🤗 Transformers 支持三个最热门的深度学习库: [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) and [TensorFlow](https://www.tensorflow.org/) — 并与之无缝整合。你可以直接使用一个框架训练你的模型然后用另一个加载和推理。 +🤗 Transformers 支持三个最热门的深度学习库: [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) 以及 [TensorFlow](https://www.tensorflow.org/) — 并与之无缝整合。你可以直接使用一个框架训练你的模型然后用另一个加载和推理。 ## 在线演示 @@ -200,7 +200,7 @@ checkpoint: 检查点 ### 使用 pip -这个仓库已在 Python 3.6+、Flax 0.3.2+、PyTorch 1.3.1+ 和 TensorFlow 2.3+ 下经过测试。 +这个仓库已在 Python 3.8+、Flax 0.4.1+、PyTorch 1.10+ 和 TensorFlow 2.6+ 下经过测试。 你可以在[虚拟环境](https://docs.python.org/3/library/venv.html)中安装 🤗 Transformers。如果你还不熟悉 Python 的虚拟环境,请阅此[用户说明](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)。 @@ -237,8 +237,11 @@ conda install -c huggingface transformers 🤗 Transformers 目前支持如下的架构(模型概述请阅[这里](https://huggingface.co/docs/transformers/model_summary)): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (来自 Google Research and the Toyota Technological Institute at Chicago) 伴随论文 [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), 由 Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut 发布。 -1. **[AltCLIP](https://huggingface.co/docs/transformers/main/model_doc/altclip)** (来自 BAAI) 伴随论文 [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) 由 Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell 发布。 +1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (来自 Google Research) 伴随论文 [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) 由 Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig 发布。 +1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (来自 BAAI) 伴随论文 [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) 由 Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell 发布。 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (来自 MIT) 伴随论文 [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) 由 Yuan Gong, Yu-An Chung, James Glass 发布。 +1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long. +1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team. 1. 
**[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (来自 Facebook) 伴随论文 [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) 由 Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer 发布。 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (来自 École polytechnique) 伴随论文 [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) 由 Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis 发布。 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (来自 VinAI Research) 伴随论文 [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) 由 Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen 发布。 @@ -248,24 +251,31 @@ conda install -c huggingface transformers 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (来自 VinAI Research) 伴随论文 [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) 由 Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen 发布。 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。 -1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (来自 Microsoft Research AI4Science) 伴随论文 [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) 由 Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu 发布。 -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (来自 Google AI) 伴随论文 [Big Transfer (BiT) 由 Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby 发布。 +1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (来自 Microsoft Research AI4Science) 伴随论文 [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) 由 Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu 发布。 +1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (来自 Google AI) 伴随论文 [Big Transfer (BiT) 由 Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby 发布。 1. 
**[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。 -1. **[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (来自 Salesforce) 伴随论文 [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) 由 Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi 发布。 +1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (来自 Salesforce) 伴随论文 [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) 由 Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi 发布。 +1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (来自 Salesforce) 伴随论文 [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) 由 Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi 发布。 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (来自 Alexa) 伴随论文 [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) 由 Adrian de Wynter and Daniel J. Perry 发布。 +1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. +1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (来自 NAVER CLOVA) 伴随论文 [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) 由 Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park 发布。 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (来自 Google Research) 伴随论文 [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) 由 Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel 发布。 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (来自 Inria/Facebook/Sorbonne) 伴随论文 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 由 Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 发布。 1. 
**[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (来自 OFA-Sys) 伴随论文 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 由 An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 发布。 +1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (来自 LAION-AI) 伴随论文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) 由 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov 发布。 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (来自 University of Göttingen) 伴随论文 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 由 Timo Lüddecke and Alexander Ecker 发布。 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 +1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (来自 MetaAI) 伴随论文 [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) 由 Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve 发布。 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (来自 Microsoft Research Asia) 伴随论文 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 由 Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang 发布。 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (来自 YituTech) 伴随论文 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 由 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 发布。 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (来自 Facebook AI) 伴随论文 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 由 Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 发布。 +1. 
**[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (来自 Tsinghua University) 伴随论文 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 由 Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 发布。 +1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/). 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (来自 Salesforce) 伴随论文 [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 由 Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher 发布。 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (来自 Microsoft) 伴随论文 [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) 由 Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang 发布。 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (来自 Facebook) 伴随论文 [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) 由 Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli 发布。 @@ -274,36 +284,53 @@ conda install -c huggingface transformers 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (来自 Berkeley/Facebook/Google) 伴随论文 [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) 由 Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch 发布。 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (来自 SenseTime Research) 伴随论文 [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) 由 Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai 发布。 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (来自 Facebook) 伴随论文 [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) 由 Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou 发布。 +1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (来自 Google AI) 伴随论文 [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) 由 Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun 发布。 +1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (来自 The University of Texas at Austin) 伴随论文 [NMS Strikes Back](https://arxiv.org/abs/2212.06137) 由 Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl 发布。 1. 
**[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (来自 Facebook) 伴随论文 [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) 由 Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko 发布。 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (来自 Microsoft Research) 伴随论文 [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) 由 Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan 发布。 1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (来自 SHI Labs) 伴随论文 [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) 由 Ali Hassani and Humphrey Shi 发布。 +1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (来自 Meta AI) 伴随论文 [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) 由 Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski 发布。 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (来自 HuggingFace), 伴随论文 [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) 由 Victor Sanh, Lysandre Debut and Thomas Wolf 发布。 同样的方法也应用于压缩 GPT-2 到 [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa 到 [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT 到 [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) 和德语版 DistilBERT。 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (来自 Microsoft Research) 伴随论文 [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) 由 Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei 发布。 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (来自 NAVER) 伴随论文 [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) 由 Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park 发布。 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (来自 Facebook) 伴随论文 [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 由 Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 发布。 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (来自 Intel Labs) 伴随论文 [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) 由 René Ranftl, Alexey Bochkovskiy, Vladlen Koltun 发布。 +1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (来自 Snap Research) 伴随论文 [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) 由 Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren 发布。 +1. 
**[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (来自 Google Research/Stanford University) 伴随论文 [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) 由 Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning 发布。 +1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (来自 Meta AI) 伴随论文 [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) 由 Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi 发布。 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (来自 Google Research) 伴随论文 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 由 Sascha Rothe, Shashi Narayan, Aliaksei Severyn 发布。 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (来自 Baidu) 伴随论文 [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu 发布。 +1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (来自 Baidu) 伴随论文 [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) 由 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang 发布。 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. +1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. 1. 
**[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (来自 CNRS) 伴随论文 [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) 由 Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab 发布。 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (来自 Facebook AI) 伴随论文 [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) 由 Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela 发布。 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (来自 Google Research) 伴随论文 [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) 由 James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon 发布。 +1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (来自 Microsoft Research) 伴随论文 [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) 由 Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao 发布。 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (来自 CMU/Google Brain) 伴随论文 [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) 由 Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le 发布。 -1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (来自 Microsoft Research) 伴随论文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) 由 Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang 发布。 +1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (来自 Microsoft Research) 伴随论文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) 由 Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang 发布。 1. 
**[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (来自 KAIST) 伴随论文 [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) 由 Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim 发布。 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (来自 OpenAI) 伴随论文 [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) 由 Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever 发布。 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (来自 EleutherAI) 随仓库 [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) 发布,作者为 Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy。 1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (来自 ABEJA) 由 Shinya Otani, Takayoshi Makabe, Anuj Arora, Kyo Hattori 发布。 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (来自 OpenAI) 伴随论文 [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 由 Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** 发布。 -1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (来自 EleutherAI) 伴随论文 [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) 由 Ben Wang and Aran Komatsuzaki 发布。 -1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (来自 EleutherAI) 伴随论文 [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) 由 Ben Wang and Aran Komatsuzaki 发布。 +1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (来自 BigCode) 伴随论文 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) 由 Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra 发布。 +1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by 坂本俊之(tanreinama). +1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (来自 UCSD, NVIDIA) 伴随论文 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 由 Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 发布。 +1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (来自 Allegro.pl, AGH University of Science and Technology) 伴随论文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) 由 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik 发布。 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 发布。 +1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (来自 OpenAI) 伴随论文 [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) 由 Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever 发布。 +1. 
**[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang. +1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (来自 Salesforce) 伴随论文 [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) 由 Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi 发布。 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever. 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) 由 Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou 发布。 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) 由 Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou 发布。 @@ -312,6 +339,8 @@ conda install -c huggingface transformers 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (来自 Meta AI) 伴随论文 [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) 由 Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze 发布。 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (来自 South China University of Technology) 伴随论文 [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) 由 Jiapeng Wang, Lianwen Jin, Kai Ding 发布。 +1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (来自 The FAIR team of Meta AI) 伴随论文 [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) 由 Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample 发布。 +1. 
**[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (来自 The FAIR team of Meta AI) 伴随论文 [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) 由 Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom 发布。 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (来自 Google AI) 伴随论文 [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) 由 Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang 发布。 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (来自 Studio Ousia) 伴随论文 [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) 由 Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto 发布。 @@ -320,32 +349,50 @@ conda install -c huggingface transformers 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (来自 Facebook) 伴随论文 [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) 由 Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin 发布。 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** 用 [OPUS](http://opus.nlpl.eu/) 数据训练的机器翻译模型由 Jörg Tiedemann 发布。[Marian Framework](https://marian-nmt.github.io/) 由微软翻译团队开发。 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (来自 Microsoft Research Asia) 伴随论文 [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) 由 Junlong Li, Yiheng Xu, Lei Cui, Furu Wei 发布。 -1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov >>>>>>> Fix rebase +1. 
**[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (来自 FAIR and UIUC) 伴随论文 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) 由 Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar 发布。 +1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov +1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (来自 Google AI) 伴随论文 [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) 由 Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos 发布。 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (来自 Facebook) 伴随论文 [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) 由 Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer 发布。 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (来自 Facebook) 伴随论文 [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) 由 Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan 发布。 +1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (来自 Facebook) 伴随论文 [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) 由 Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer 发布。 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (来自 NVIDIA) 伴随论文 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 由 Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 发布。 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (来自 NVIDIA) 伴随论文 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 由 Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 发布。 +1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (来自 Alibaba Research) 伴随论文 [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) 由 Peng Wang, Cheng Da, and Cong Yao 发布。 +1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.. 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (来自 Studio Ousia) 伴随论文 [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) 由 Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka 发布。 +1. 
**[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (来自 Facebook) 伴随论文 [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) 由 Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli 发布。 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (来自 CMU/Google Brain) 伴随论文 [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) 由 Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou 发布。 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (来自 Google Inc.) 伴随论文 [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) 由 Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam 发布。 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (来自 Google Inc.) 伴随论文 [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) 由 Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen 发布。 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (来自 Apple) 伴随论文 [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) 由 Sachin Mehta and Mohammad Rastegari 发布。 +1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (来自 Apple) 伴随论文 [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) 由 Sachin Mehta and Mohammad Rastegari 发布。 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (来自 Microsoft Research) 伴随论文 [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) 由 Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu 发布。 +1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (来自 MosaiML) 伴随论文 [llm-foundry](https://github.com/mosaicml/llm-foundry/) 由 the MosaicML NLP Team 发布。 +1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (来自 the University of Wisconsin - Madison) 伴随论文 [Multi Resolution Analysis (MRA)](https://arxiv.org/abs/2207.10284) 由 Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh 发布。 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (来自 Google AI) 伴随论文 [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 由 Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel 发布。 +1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez. 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (来自 中国人民大学 AI Box) 伴随论文 [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) 由 Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen 发布。 1. 
**[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (来自 SHI Labs) 伴随论文 [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) 由 Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi 发布。 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (来自华为诺亚方舟实验室) 伴随论文 [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) 由 Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu 发布。 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (来自 Meta) 伴随论文 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) 由 the NLLB team 发布。 +1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (来自 Meta) 伴随论文 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) 由 the NLLB team 发布。 +1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (来自 Meta AI) 伴随论文 [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) 由 Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic 发布。 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (来自 the University of Wisconsin - Madison) 伴随论文 [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) 由 Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh 发布。 +1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (来自 SHI Labs) 伴随论文 [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) 由 Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi 发布。 +1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (来自 [s-JoL](https://huggingface.co/s-JoL)) 由 [Open-Llama](https://github.com/s-JoL/Open-Llama) 发布. 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (来自 Meta AI) 伴随论文 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 由 Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 发布。 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (来自 Google AI) 伴随论文 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 由 Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 发布。 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (来自 Google) 伴随论文 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 由 Jason Phang, Yao Zhao, Peter J. Liu 发布。 1. 
**[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (来自 Deepmind) 伴随论文 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 由 Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 发布。 +1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (来自 ADEPT) 伴随论文 [blog post](https://www.adept.ai/blog/persimmon-8b) 由 Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani 发布。 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (来自 VinAI Research) 伴随论文 [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) 由 Dat Quoc Nguyen and Anh Tuan Nguyen 发布。 +1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (来自 Google) 伴随论文 [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) 由 Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova 发布。 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (来自 UCLA NLP) 伴随论文 [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) 由 Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang 发布。 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (来自 Sea AI Labs) 伴随论文 [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) 由 Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng 发布。 +1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。 +1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (来自 Nanjing University, The University of Hong Kong etc.) 伴随论文 [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) 由 Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao 发布。 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (来自 NVIDIA) 伴随论文 [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) 由 Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius 发布。 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (来自 Facebook) 伴随论文 [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) 由 Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela 发布。 1. 
**[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (来自 Google Research) 伴随论文 [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) 由 Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang 发布。 @@ -354,57 +401,70 @@ conda install -c huggingface transformers 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (来自 Google Research) 伴随论文 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 由 Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 发布。 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (来自 Facebook), 伴随论文 [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 由 Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 发布。 -1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/model_doc/roberta-prelayernorm)** (来自 Facebook) 伴随论文 [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 由 Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli 发布。 -1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (来自 WeChatAI), 伴随论文 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 由 HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 发布。 +1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (来自 Facebook) 伴随论文 [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 由 Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli 发布。 +1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (来自 WeChatAI), 伴随论文 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 由 HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 发布。 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (来自 ZhuiyiTechnology), 伴随论文 [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 由 Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 发布。 +1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (来自 Bo Peng) 伴随论文 [this repo](https://github.com/BlinkDL/RWKV-LM) 由 Bo Peng 发布。 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (来自 NVIDIA) 伴随论文 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 由 Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 发布。 +1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (来自 Meta AI) 伴随论文 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) 由 Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick 发布。 1. 
**[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (来自 ASAPP) 伴随论文 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 由 Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 发布。 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (来自 ASAPP) 伴随论文 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 由 Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 发布。 +1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (来自 Microsoft Research) 伴随论文 [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) 由 Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei 发布。 1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (来自 Facebook), 伴随论文 [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) 由 Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino 发布。 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (来自 Facebook) 伴随论文 [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) 由 Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau 发布。 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (来自 Tel Aviv University) 伴随论文 [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) 由 Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy 发布。 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (来自 Berkeley) 伴随论文 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 由 Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 发布。 +1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (来自 MBZUAI) 伴随论文 [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) 由 Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan 发布。 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (来自 Microsoft) 伴随论文 [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) 由 Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo 发布。 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (来自 Microsoft) 伴随论文 [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) 由 Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo 发布。 -1. **[Swin2SR](https://huggingface.co/docs/transformers/main/model_doc/swin2sr)** (来自 University of Würzburg) 伴随论文 [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) 由 Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte 发布。 -1. 
**[SwitchTransformers](https://huggingface.co/docs/transformers/main/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. +1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (来自 University of Würzburg) 伴随论文 [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) 由 Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte 发布。 +1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (来自 Google AI) 伴随论文 [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (来自 Google AI) 伴随论文 [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (来自 Microsoft Research) 伴随论文 [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) 由 Brandon Smock, Rohith Pesala, Robin Abraham 发布。 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (来自 Google AI) 伴随论文 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 由 Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 发布。 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (来自 Microsoft Research) 伴随论文 [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 由 Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou 发布。 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). -1. **[TimeSformer](https://huggingface.co/docs/transformers/main/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. +1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. 1. 
**[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (来自 Google/CMU) 伴随论文 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 由 Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 发布。 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (来自 Microsoft) 伴随论文 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 由 Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 发布。 +1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (来自 UNC Chapel Hill) 伴随论文 [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) 由 Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal 发布。 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler +1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (来自 Google Research) 伴随论文 [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) 由 Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant 发布。 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (来自 Microsoft Research) 伴随论文 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 由 Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 发布。 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (来自 Microsoft Research) 伴随论文 [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) 由 Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu 发布。 +1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (来自 Peking University) 伴随论文 [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) 由 Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun 发布。 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (来自 Tsinghua University and Nankai University) 伴随论文 [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) 由 Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu 发布。 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (来自 Multimedia Computing Group, Nanjing University) 伴随论文 [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) 由 Zhan Tong, Yibing Song, Jue Wang, Limin Wang 发布。 1. 
**[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (来自 NAVER AI Lab/Kakao Enterprise/Kakao Brain) 伴随论文 [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) 由 Wonjae Kim, Bokyung Son, Ildoo Kim 发布。 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (来自 UCLA NLP) 伴随论文 [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) 由 Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang 发布。 -1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。 +1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。 +1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (来自 Meta AI) 伴随论文 [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) 由 Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He 发布。 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (来自 Meta AI) 伴随论文 [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) 由 Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick 发布。 +1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (来自 HUST-VL) 伴随论文 [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) 由 Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang 发布。 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (来自 Meta AI) 伴随论文 [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas 发布. +1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (来自 Kakao Enterprise) 伴随论文 [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) 由 Jaehyeon Kim, Jungil Kong, Juhee Son 发布。 +1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (来自 Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) 由 Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. 
**[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (来自 Facebook AI) 伴随论文 [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 由 Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli 发布。 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (来自 Facebook AI) 伴随论文 [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) 由 Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino 发布。 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (来自 Facebook AI) 伴随论文 [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) 由 Qiantong Xu, Alexei Baevski, Michael Auli 发布。 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. 1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (来自 OpenAI) 伴随论文 [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) 由 Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever 发布。 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (来自 Microsoft Research) 伴随论文 [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) 由 Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling 发布。 +1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (来自 Meta AI) 伴随论文 [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) 由 Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe 发布。 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (来自 Facebook) 伴随论文 [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) 由 Guillaume Lample and Alexis Conneau 发布。 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。 1. 
**[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (来自 Facebook AI), 伴随论文 [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) 由 Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov 发布。 1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (来自 Facebook AI) 伴随论文 [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) 由 Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau 发布。 +1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (来自 Meta AI) 伴随论文 [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) 由 Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa 发布。 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (来自 Google/CMU) 伴随论文 [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) 由 Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le 发布。 1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (来自 Facebook AI) 伴随论文 [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) 由 Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli 发布。 1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (来自 Facebook AI) 伴随论文 [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) 由 Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli 发布。 1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (来自 Huazhong University of Science & Technology) 伴随论文 [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) 由 Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu 发布。 -1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (来自 the University of Wisconsin - Madison) 伴随论文 [You Only Sample (Almost) 由 Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh 发布。 +1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (来自 the University of Wisconsin - Madison) 伴随论文 [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) 由 Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh 发布。 1. 
想要贡献新的模型?我们这里有一份**详细指引和模板**来引导你添加新的模型。你可以在 [`templates`](./templates) 目录中找到他们。记得查看 [贡献指南](./CONTRIBUTING.md) 并在开始写 PR 前联系维护人员或开一个新的 issue 来获得反馈。

要检查某个模型是否已有 Flax、PyTorch 或 TensorFlow 的实现,或其是否在 🤗 Tokenizers 库中有对应词符化器(tokenizer),敬请参阅[此表](https://huggingface.co/docs/transformers/index#supported-frameworks)。

@@ -416,7 +476,7 @@ conda install -c huggingface transformers

| 章节 | 描述 |
|-|-|
-| [文档](https://huggingface.co/transformers/) | 完整的 API 文档和教程 |
+| [文档](https://huggingface.co/docs/transformers/) | 完整的 API 文档和教程 |
| [任务总结](https://huggingface.co/docs/transformers/task_summary) | 🤗 Transformers 支持的任务 |
| [预处理教程](https://huggingface.co/docs/transformers/preprocessing) | 使用 `Tokenizer` 来为模型准备数据 |
| [训练和微调](https://huggingface.co/docs/transformers/training) | 在 PyTorch/TensorFlow 的训练循环或 `Trainer` API 中使用 🤗 Transformers 提供的模型 |

diff --git a/README_zh-hant.md b/README_zh-hant.md
index 5d0f1b9057a3..e16a47713c5e 100644
--- a/README_zh-hant.md
+++ b/README_zh-hant.md
@@ -55,7 +55,7 @@ user: 使用者
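The 预处理教程 and 训练和微调 rows in the table above point to the `Tokenizer` and `Trainer` workflow. A minimal sketch of that workflow, assuming the standard `transformers` Python API (`pipeline`, `AutoTokenizer`, `AutoModelForSequenceClassification`); the checkpoint name is illustrative and not part of this diff:

```python
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# High-level path: a pipeline wraps tokenizer + model behind a single call.
# The checkpoint name is only an example; any compatible checkpoint can be used.
classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
print(classifier("We are very happy to introduce pipeline to the transformers repository."))

# Lower-level path, as in the preprocessing docs linked above:
# the tokenizer turns raw text into model-ready tensors, and the model consumes them.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
inputs = tokenizer("Hello world!", return_tensors="pt")
outputs = model(**inputs)
print(outputs.logits)
```

The same `inputs` tensors are what a `Trainer` or a plain PyTorch/TensorFlow training loop would consume, which is what the 训练和微调 guide covers.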

@@ -84,7 +84,7 @@ user: 使用者
Español | 日本語 | हिन्दी
@@ -212,7 +212,7 @@ Tokenizer 為所有的預訓練模型提供了預處理,並可以直接轉換 ### 使用 pip -這個 Repository 已在 Python 3.6+、Flax 0.3.2+、PyTorch 1.3.1+ 和 TensorFlow 2.3+ 下經過測試。 +這個 Repository 已在 Python 3.8+、Flax 0.4.1+、PyTorch 1.10+ 和 TensorFlow 2.6+ 下經過測試。 你可以在[虛擬環境](https://docs.python.org/3/library/venv.html)中安裝 🤗 Transformers。如果你還不熟悉 Python 的虛擬環境,請閱此[使用者指引](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)。 @@ -249,8 +249,11 @@ conda install -c huggingface transformers 🤗 Transformers 目前支援以下的架構(模型概覽請參閱[這裡](https://huggingface.co/docs/transformers/model_summary)): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[AltCLIP](https://huggingface.co/docs/transformers/main/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. +1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig. +1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. +1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long. +1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team. 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. 
**[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. @@ -260,24 +263,31 @@ conda install -c huggingface transformers 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. +1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. 
**[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry. +1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. +1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park. 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. +1. 
**[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. +1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. 1. 
**[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. +1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/). 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. @@ -286,36 +296,53 @@ conda install -c huggingface transformers 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai. 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. +1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun. +1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl. 1. 
**[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. 1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi. +1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski. 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) and a German version of DistilBERT. 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER) released with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. +1. 
**[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.
+1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
+1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi.
1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu.
+1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
+1.
**[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. 1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. +1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -1. 
**[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. +1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. 1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori. 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. -1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released with the paper [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. -1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released with the paper [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. +1. 
**[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. +1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by 坂本俊之(tanreinama). +1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. +1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. +1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. 
Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. +1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang. +1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever. 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. @@ -324,6 +351,8 @@ conda install -c huggingface transformers 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze. 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding. +1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. +1. 
**[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
@@ -332,32 +361,50 @@ conda install -c huggingface transformers
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
+1.
**[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. 1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov +1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos. 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. +1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Facebook) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao. +1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.. 1. 
**[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
+1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam.
1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen.
1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
+1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
+1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released in the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
+1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh.
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
+1.
**[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez. 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. 1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi. 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic. 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. +1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. +1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. 1. 
**[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, Peter J. Liu.
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
+1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
+1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
+1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee.
1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
+1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.)
released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. @@ -366,57 +413,70 @@ conda install -c huggingface transformers 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. -1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. +1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. +1. 
**[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou.
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
+1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng) released in [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
+1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
+1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook) released with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University) released with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
1.
**[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. +1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. -1. **[Swin2SR](https://huggingface.co/docs/transformers/main/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. -1. **[SwitchTransformers](https://huggingface.co/docs/transformers/main/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. +1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. +1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released with the paper [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. 
**[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). -1. **[TimeSformer](https://huggingface.co/docs/transformers/main/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. +1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. +1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal. 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler +1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant. 1. 
**[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. +1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun. 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. 
**[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He. 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. +1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. +1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. +1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. 1. 
**[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. 1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever. 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling. +1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe. 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. 1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI) released with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. +1. 
**[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa. 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. 1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. 1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. 1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu. -1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. +1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. 1. 想要貢獻新的模型?我們這裡有一份**詳細指引和模板**來引導你加入新的模型。你可以在 [`templates`](./templates) 目錄中找到它們。記得查看[貢獻指引](./CONTRIBUTING.md)並在開始寫 PR 前聯繫維護人員或開一個新的 issue 來獲得 feedbacks。 要檢查某個模型是否已有 Flax、PyTorch 或 TensorFlow 的實作,或其是否在🤗 Tokenizers 函式庫中有對應的 tokenizer,敬請參閱[此表](https://huggingface.co/docs/transformers/index#supported-frameworks)。 diff --git a/awesome-transformers.md b/awesome-transformers.md new file mode 100644 index 000000000000..013f88259c91 --- /dev/null +++ b/awesome-transformers.md @@ -0,0 +1,609 @@ +# Awesome projects built with Transformers + +This page lists awesome projects built on top of Transformers. Transformers is more than a toolkit to use pretrained +models: it's a community of projects built around it and the Hugging Face Hub. We want Transformers to enable +developers, researchers, students, professors, engineers, and anyone else to build their dream projects. + +In this list, we showcase incredibly impactful and novel projects that have pushed the field forward. We celebrate +100 of these projects as we reach the milestone of 100k stars as a community; but we're very open to pull requests +adding other projects to the list. 
If you believe a project should be here and it's not, then please open a PR +to add it. + +## [gpt4all](https://github.com/nomic-ai/gpt4all) + +[gpt4all](https://github.com/nomic-ai/gpt4all) is an ecosystem of open-source chatbots trained on massive collections of clean assistant data including code, stories and dialogue. It offers open-source, large language models such as LLaMA and GPT-J trained in an assistant style. + +Keywords: Open-source, LLaMa, GPT-J, instruction, assistant + +## [recommenders](https://github.com/microsoft/recommenders) + +This repository contains examples and best practices for building recommendation systems, provided as Jupyter notebooks. It goes over several aspects required to build efficient recommendation systems: data preparation, modeling, evaluation, model selection & optimization, as well as operationalization. + +Keywords: Recommender systems, AzureML + +## [lama-cleaner](https://github.com/Sanster/lama-cleaner) + +Image inpainting tool powered by Stable Diffusion. Remove any unwanted object, defect, or person from your pictures, or erase and replace anything in your pictures. + +Keywords: inpainting, SD, Stable Diffusion + +## [flair](https://github.com/flairNLP/flair) + +FLAIR is a powerful PyTorch NLP framework, covering several important tasks: NER, sentiment analysis, part-of-speech tagging, text and document embeddings, among other things. + +Keywords: NLP, text embedding, document embedding, biomedical, NER, PoS, sentiment-analysis + +## [mindsdb](https://github.com/mindsdb/mindsdb) + +MindsDB is a low-code ML platform, which automates and integrates several ML frameworks into the data stack as "AI Tables" to streamline the integration of AI into applications, making it accessible to developers of all skill levels. + +Keywords: Database, low-code, AI table + +## [langchain](https://github.com/hwchase17/langchain) + +[langchain](https://github.com/hwchase17/langchain) is aimed at assisting in the development of apps merging both LLMs and other sources of knowledge. The library allows chaining calls to applications, creating a sequence across many tools. + +Keywords: LLMs, Large Language Models, Agents, Chains + +## [LlamaIndex](https://github.com/jerryjliu/llama_index) + +[LlamaIndex](https://github.com/jerryjliu/llama_index) is a project that provides a central interface to connect your LLMs with external data. It provides various kinds of indices and retrieval mechanisms to perform different LLM tasks and obtain knowledge-augmented results. + +Keywords: LLMs, Large Language Models, Data Retrieval, Indices, Knowledge Augmentation + +## [ParlAI](https://github.com/facebookresearch/ParlAI) + +[ParlAI](https://github.com/facebookresearch/ParlAI) is a Python framework for sharing, training and testing dialogue models, from open-domain chitchat, to task-oriented dialogue, to visual question answering. It provides more than 100 datasets under the same API, a large zoo of pretrained models, a set of agents, and has several integrations. + +Keywords: Dialogue, Chatbots, VQA, Datasets, Agents + +## [sentence-transformers](https://github.com/UKPLab/sentence-transformers) + +This framework provides an easy method to compute dense vector representations for sentences, paragraphs, and images. The models are based on transformer networks like BERT / RoBERTa / XLM-RoBERTa etc. and achieve state-of-the-art performance in various tasks. Text is embedded in a vector space such that similar text is close and can efficiently be found using cosine similarity.
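A minimal sketch of this embed-and-compare workflow (the checkpoint name and the example sentences are illustrative, and the snippet assumes the sentence-transformers package is installed):

```python
from sentence_transformers import SentenceTransformer, util

# Illustrative checkpoint; any sentence-transformers model works the same way.
model = SentenceTransformer("all-MiniLM-L6-v2")

sentences = [
    "A man is eating food.",
    "Someone is having a meal.",
    "The sky is clear and blue today.",
]

# Encode the sentences into dense vectors.
embeddings = model.encode(sentences, convert_to_tensor=True)

# Cosine similarity: the first two sentences should score higher than the third.
scores = util.cos_sim(embeddings[0], embeddings[1:])
print(scores)
```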
+ +Keywords: Dense vector representations, Text embeddings, Sentence embeddings + +## [ludwig](https://github.com/ludwig-ai/ludwig) + +Ludwig is a declarative machine learning framework that makes it easy to define machine learning pipelines using a simple and flexible data-driven configuration system. Ludwig is targeted at a wide variety of AI tasks. It provides a data-driven configuration system, training, prediction, and evaluation scripts, as well as a programmatic API. + +Keywords: Declarative, Data-driven, ML Framework + +## [InvokeAI](https://github.com/invoke-ai/InvokeAI) + +[InvokeAI](https://github.com/invoke-ai/InvokeAI) is an engine for Stable Diffusion models, aimed at professionals, artists, and enthusiasts. It leverages the latest AI-driven technologies through CLI as well as a WebUI. + +Keywords: Stable-Diffusion, WebUI, CLI + +## [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP) + +[PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP) is an easy-to-use and powerful NLP library particularly targeted at the Chinese languages. It has support for multiple pre-trained model zoos, and supports a wide-range of NLP tasks from research to industrial applications. + +Keywords: NLP, Chinese, Research, Industry + +## [stanza](https://github.com/stanfordnlp/stanza) + +The Stanford NLP Group's official Python NLP library. It contains support for running various accurate natural language processing tools on 60+ languages and for accessing the Java Stanford CoreNLP software from Python. + +Keywords: NLP, Multilingual, CoreNLP + +## [DeepPavlov](https://github.com/deeppavlov/DeepPavlov) + +[DeepPavlov](https://github.com/deeppavlov/DeepPavlov) is an open-source conversational AI library. It is designed for the development of production ready chat-bots and complex conversational systems, as well as research in the area of NLP and, particularly, of dialog systems. + +Keywords: Conversational, Chatbot, Dialog + +## [alpaca-lora](https://github.com/tloen/alpaca-lora) + +Alpaca-lora contains code for reproducing the Stanford Alpaca results using low-rank adaptation (LoRA). The repository provides training (fine-tuning) as well as generation scripts. + +Keywords: LoRA, Parameter-efficient fine-tuning + +## [imagen-pytorch](https://github.com/lucidrains/imagen-pytorch) + +An open-source Implementation of Imagen, Google's closed-source Text-to-Image Neural Network that beats DALL-E2. As of release, it is the new SOTA for text-to-image synthesis. + +Keywords: Imagen, Text-to-image + +## [adapter-transformers](https://github.com/adapter-hub/adapter-transformers) + +[adapter-transformers](https://github.com/adapter-hub/adapter-transformers) is an extension of HuggingFace's Transformers library, integrating adapters into state-of-the-art language models by incorporating AdapterHub, a central repository for pre-trained adapter modules. It is a drop-in replacement for transformers, which is regularly updated to stay up-to-date with the developments of transformers. + +Keywords: Adapters, LoRA, Parameter-efficient fine-tuning, Hub + +## [NeMo](https://github.com/NVIDIA/NeMo) + +NVIDIA [NeMo](https://github.com/NVIDIA/NeMo) is a conversational AI toolkit built for researchers working on automatic speech recognition (ASR), text-to-speech synthesis (TTS), large language models (LLMs), and natural language processing (NLP). 
The primary objective of [NeMo](https://github.com/NVIDIA/NeMo) is to help researchers from industry and academia to reuse prior work (code and pretrained models) and make it easier to create new [conversational AI models](https://developer.nvidia.com/conversational-ai#started). + +Keywords: Conversational, ASR, TTS, LLMs, NLP + +## [Runhouse](https://github.com/run-house/runhouse) + +[Runhouse](https://github.com/run-house/runhouse) allows you to send code and data to any of your compute or data infra, all in Python, and continue to interact with them normally from your existing code and environment. Runhouse developers mention: + +> Think of it as an expansion pack to your Python interpreter that lets it take detours to remote machines or manipulate remote data. + +Keywords: MLOps, Infrastructure, Data storage, Modeling + +## [MONAI](https://github.com/Project-MONAI/MONAI) + +[MONAI](https://github.com/Project-MONAI/MONAI) is a PyTorch-based, open-source framework for deep learning in healthcare imaging, part of the PyTorch Ecosystem. Its ambitions are: +- developing a community of academic, industrial and clinical researchers collaborating on a common foundation; +- creating state-of-the-art, end-to-end training workflows for healthcare imaging; +- providing researchers with an optimized and standardized way to create and evaluate deep learning models. + +Keywords: Healthcare imaging, Training, Evaluation + +## [simpletransformers](https://github.com/ThilinaRajapakse/simpletransformers) + +Simple Transformers lets you quickly train and evaluate Transformer models. Only 3 lines of code are needed to initialize, train, and evaluate a model. It supports a wide variety of NLP tasks. + +Keywords: Framework, simplicity, NLP + +## [JARVIS](https://github.com/microsoft/JARVIS) + +[JARVIS](https://github.com/microsoft/JARVIS) is a system attempting to merge LLMs such as GPT-4 with the rest of the open-source ML community: leveraging up to 60 downstream models in order to perform tasks identified by the LLM. + +Keywords: LLM, Agents, HF Hub + +## [transformers.js](https://xenova.github.io/transformers.js/) + +[transformers.js](https://xenova.github.io/transformers.js/) is a JavaScript library targeted at running models from transformers directly within the browser. + +Keywords: Transformers, JavaScript, browser + +## [bumblebee](https://github.com/elixir-nx/bumblebee) + +Bumblebee provides pre-trained Neural Network models on top of Axon, a neural networks library for the Elixir language. It includes integration with 🤗 Models, allowing anyone to download and perform Machine Learning tasks with a few lines of code. + +Keywords: Elixir, Axon + +## [argilla](https://github.com/argilla-io/argilla) + +Argilla is an open-source platform providing advanced NLP labeling, monitoring, and workspaces. It is compatible with many open source ecosystems such as Hugging Face, Stanza, FLAIR, and others. + +Keywords: NLP, Labeling, Monitoring, Workspaces + +## [haystack](https://github.com/deepset-ai/haystack) + +Haystack is an open source NLP framework to interact with your data using Transformer models and LLMs. It offers production-ready tools to quickly build complex decision making, question answering, semantic search, text generation applications, and more. + +Keywords: NLP, Framework, LLM + +## [spaCy](https://github.com/explosion/spaCy) + +[spaCy](https://github.com/explosion/spaCy) is a library for advanced Natural Language Processing in Python and Cython.
It's built on the very latest research, and was designed from day one to be used in real products. It offers support for transformers models through its third party package, spacy-transformers. + +Keywords: NLP, Framework + +## [speechbrain](https://github.com/speechbrain/speechbrain) + +SpeechBrain is an open-source and all-in-one conversational AI toolkit based on PyTorch. +The goal is to create a single, flexible, and user-friendly toolkit that can be used to easily develop state-of-the-art speech technologies, including systems for speech recognition, speaker recognition, speech enhancement, speech separation, language identification, multi-microphone signal processing, and many others. + +Keywords: Conversational, Speech + +## [skorch](https://github.com/skorch-dev/skorch) + +Skorch is a scikit-learn compatible neural network library that wraps PyTorch. It has support for models within transformers, and tokenizers from tokenizers. + +Keywords: Scikit-Learn, PyTorch + +## [bertviz](https://github.com/jessevig/bertviz) + +BertViz is an interactive tool for visualizing attention in Transformer language models such as BERT, GPT2, or T5. It can be run inside a Jupyter or Colab notebook through a simple Python API that supports most Huggingface models. + +Keywords: Visualization, Transformers + +## [mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax) + +[mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax) is a haiku library using the xmap/pjit operators in JAX for model parallelism of transformers. This library is designed for scalability up to approximately 40B parameters on TPUv3s. It was the library used to train the GPT-J model. + +Keywords: Haiku, Model parallelism, LLM, TPU + +## [deepchem](https://github.com/deepchem/deepchem) + +DeepChem aims to provide a high quality open-source toolchain that democratizes the use of deep-learning in drug discovery, materials science, quantum chemistry, and biology. + +Keywords: Drug discovery, Materials Science, Quantum Chemistry, Biology + +## [OpenNRE](https://github.com/thunlp/OpenNRE) + +An Open-Source Package for Neural Relation Extraction (NRE). It is targeted at a wide range of users, from newcomers to relation extraction, to developers, researchers, or students. + +Keywords: Neural Relation Extraction, Framework + +## [pycorrector](https://github.com/shibing624/pycorrector) + +PyCorrector is a Chinese Text Error Correction Tool. It uses a language model to detect errors, pinyin feature and shape feature to correct Chinese text errors. it can be used for Chinese Pinyin and stroke input method. + +Keywords: Chinese, Error correction tool, Language model, Pinyin + +## [nlpaug](https://github.com/makcedward/nlpaug) + +This python library helps you with augmenting nlp for machine learning projects. It is a lightweight library featuring synthetic data generation for improving model performance, support for audio and text, and compatibility with several ecosystems (scikit-learn, pytorch, tensorflow). + +Keywords: Data augmentation, Synthetic data generation, Audio, NLP + +## [dream-textures](https://github.com/carson-katri/dream-textures) + +[dream-textures](https://github.com/carson-katri/dream-textures) is a library targeted at bringing stable-diffusion support within Blender. It supports several use-cases, such as image generation, texture projection, inpainting/outpainting, ControlNet, and upscaling. 
+ +Keywords: Stable-Diffusion, Blender + +## [seldon-core](https://github.com/SeldonIO/seldon-core) + +Seldon core converts your ML models (Tensorflow, Pytorch, H2o, etc.) or language wrappers (Python, Java, etc.) into production REST/GRPC microservices. +Seldon handles scaling to thousands of production machine learning models and provides advanced machine learning capabilities out of the box including Advanced Metrics, Request Logging, Explainers, Outlier Detectors, A/B Tests, Canaries and more. + +Keywords: Microservices, Modeling, Language wrappers + +## [open_model_zoo](https://github.com/openvinotoolkit/open_model_zoo) + +This repository includes optimized deep learning models and a set of demos to expedite development of high-performance deep learning inference applications. Use these free pre-trained models instead of training your own models to speed-up the development and production deployment process. + +Keywords: Optimized models, Demos + +## [ml-stable-diffusion](https://github.com/apple/ml-stable-diffusion) + +ML-Stable-Diffusion is a repository by Apple bringing Stable Diffusion support to Core ML, on Apple Silicon devices. It supports stable diffusion checkpoints hosted on the Hugging Face Hub. + +Keywords: Stable Diffusion, Apple Silicon, Core ML + +## [stable-dreamfusion](https://github.com/ashawkey/stable-dreamfusion) + +Stable-Dreamfusion is a pytorch implementation of the text-to-3D model Dreamfusion, powered by the Stable Diffusion text-to-2D model. + +Keywords: Text-to-3D, Stable Diffusion + +## [txtai](https://github.com/neuml/txtai) + +[txtai](https://github.com/neuml/txtai) is an open-source platform for semantic search and workflows powered by language models. txtai builds embeddings databases, which are a union of vector indexes and relational databases enabling similarity search with SQL. Semantic workflows connect language models together into unified applications. + +Keywords: Semantic search, LLM + +## [djl](https://github.com/deepjavalibrary/djl) + +Deep Java Library (DJL) is an open-source, high-level, engine-agnostic Java framework for deep learning. DJL is designed to be easy to get started with and simple to use for developers. DJL provides a native Java development experience and functions like any other regular Java library. DJL offers [a Java binding](https://github.com/deepjavalibrary/djl/tree/master/extensions/tokenizers) for HuggingFace Tokenizers and easy conversion toolkit for HuggingFace model to deploy in Java. + +Keywords: Java, Framework + +## [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness/) + +This project provides a unified framework to test generative language models on a large number of different evaluation tasks. It has support for more than 200 tasks, and supports different ecosystems: HF Transformers, GPT-NeoX, DeepSpeed, as well as the OpenAI API. + +Keywords: LLM, Evaluation, Few-shot + +## [gpt-neox](https://github.com/EleutherAI/gpt-neox) + +This repository records EleutherAI's library for training large-scale language models on GPUs. The framework is based on NVIDIA's Megatron Language Model and has been augmented with techniques from DeepSpeed as well as some novel optimizations. It is focused on training multi-billion-parameter models. + +Keywords: Training, LLM, Megatron, DeepSpeed + +## [muzic](https://github.com/microsoft/muzic) + +Muzic is a research project on AI music that empowers music understanding and generation with deep learning and artificial intelligence. 
Muzic was created by researchers from Microsoft Research Asia. + +Keywords: Music understanding, Music generation + +## [dalle-flow](https://github.com/jina-ai/dalle-flow) + +DALL·E Flow is an interactive workflow for generating high-definition images from a text prompt. It leverages DALL·E-Mega, GLID-3 XL, and Stable Diffusion to generate image candidates, and then calls CLIP-as-service to rank the candidates w.r.t. the prompt. +The preferred candidate is fed to GLID-3 XL for diffusion, which often enriches the texture and background. Finally, the candidate is upscaled to 1024x1024 via SwinIR. + +Keywords: High-definition image generation, Stable Diffusion, DALL-E Mega, GLID-3 XL, CLIP, SwinIR + +## [lightseq](https://github.com/bytedance/lightseq) + +LightSeq is a high-performance training and inference library for sequence processing and generation, implemented in CUDA. It enables highly efficient computation of modern NLP and CV models such as BERT, GPT, Transformer, etc. It is therefore especially useful for machine translation, text generation, image classification, and other sequence-related tasks. + +Keywords: Training, Inference, Sequence Processing, Sequence Generation + +## [LaTeX-OCR](https://github.com/lukas-blecher/LaTeX-OCR) + +The goal of this project is to create a learning-based system that takes an image of a math formula and returns the corresponding LaTeX code. + +Keywords: OCR, LaTeX, Math formula + +## [open_clip](https://github.com/mlfoundations/open_clip) + +OpenCLIP is an open source implementation of OpenAI's CLIP. + +The goal of this repository is to enable training models with contrastive image-text supervision, and to investigate their properties such as robustness to distribution shift. +The starting point is an implementation of CLIP that matches the accuracy of the original CLIP models when trained on the same dataset. + +Specifically, a ResNet-50 model trained with this codebase on OpenAI's 15 million image subset of YFCC achieves 32.7% top-1 accuracy on ImageNet. + +Keywords: CLIP, Open-source, Contrastive, Image-text + +## [dalle-playground](https://github.com/saharmor/dalle-playground) + +A playground to generate images from any text prompt using Stable Diffusion and Dall-E mini. + +Keywords: WebUI, Stable Diffusion, Dall-E mini + +## [FedML](https://github.com/FedML-AI/FedML) + +[FedML](https://github.com/FedML-AI/FedML) is a federated learning and analytics library enabling secure and collaborative machine learning on decentralized data anywhere at any scale. + +It supports large-scale cross-silo federated learning, cross-device federated learning on smartphones/IoT devices, and research simulation. + +Keywords: Federated Learning, Analytics, Collaborative ML, Decentralized + +## [gpt-code-clippy](https://github.com/CodedotAl/gpt-code-clippy) + +GPT-Code-Clippy (GPT-CC) is an open source version of GitHub Copilot, a language model -- based on GPT-3, called GPT-Codex -- that is fine-tuned on publicly available code from GitHub. + +Keywords: LLM, Code + +## [TextAttack](https://github.com/QData/TextAttack) + +[TextAttack](https://github.com/QData/TextAttack) 🐙 is a Python framework for adversarial attacks, data augmentation, and model training in NLP. + +Keywords: Adversarial attacks, Data augmentation, NLP + +## [OpenPrompt](https://github.com/thunlp/OpenPrompt) + +Prompt-learning is a paradigm to adapt pre-trained language models (PLMs) to downstream NLP tasks, which modifies the input text with a textual template and directly uses PLMs to conduct pre-training tasks.
This library provides a standard, flexible and extensible framework to deploy the prompt-learning pipeline. [OpenPrompt](https://github.com/thunlp/OpenPrompt) supports loading PLMs directly from https://github.com/huggingface/transformers. + +## [text-generation-webui](https://github.com/oobabooga/text-generation-webui/) + +[text-generation-webui](https://github.com/oobabooga/text-generation-webui/) is a Gradio Web UI for running Large Language Models like LLaMA, llama.cpp, GPT-J, Pythia, OPT, and GALACTICA. + +Keywords: LLM, WebUI + +## [libra](https://github.com/Palashio/libra) + +An ergonomic machine learning [libra](https://github.com/Palashio/libra)ry for non-technical users. It focuses on ergonomics and on ensuring that training a model is as simple as it can be. + +Keywords: Ergonomic, Non-technical + +## [alibi](https://github.com/SeldonIO/alibi) + +Alibi is an open source Python library aimed at machine learning model inspection and interpretation. The focus of the library is to provide high-quality implementations of black-box, white-box, local and global explanation methods for classification and regression models. + +Keywords: Model inspection, Model interpretation, Black-box, White-box + +## [tortoise-tts](https://github.com/neonbjb/tortoise-tts) + +Tortoise is a text-to-speech program built with the following priorities: strong multi-voice capabilities, and highly realistic prosody and intonation. + +Keywords: Text-to-speech + +## [flower](https://github.com/adap/flower) + +Flower (flwr) is a framework for building federated learning systems. The design of Flower is based on a few guiding principles: customizability, extendability, framework agnosticity, and ease-of-use. + +Keywords: Federated learning systems, Customizable, Extendable, Framework-agnostic, Simplicity + +## [fast-bert](https://github.com/utterworks/fast-bert) + +Fast-Bert is a deep learning library that allows developers and data scientists to train and deploy BERT and XLNet based models for natural language processing tasks beginning with Text Classification. It is aimed at simplicity. + +Keywords: Deployment, BERT, XLNet + +## [towhee](https://github.com/towhee-io/towhee) + +Towhee makes it easy to build neural data processing pipelines for AI applications. We provide hundreds of models, algorithms, and transformations that can be used as standard pipeline building blocks. Users can use Towhee's Pythonic API to build a prototype of their pipeline and automatically optimize it for production-ready environments. + +Keywords: Data processing pipeline, Optimization + +## [alibi-detect](https://github.com/SeldonIO/alibi-detect) + +Alibi Detect is an open source Python library focused on outlier, adversarial and drift detection. The package aims to cover both online and offline detectors for tabular data, text, images and time series. Both TensorFlow and PyTorch backends are supported for drift detection. + +Keywords: Adversarial, Outlier, Drift detection + +## [FARM](https://github.com/deepset-ai/FARM) + +[FARM](https://github.com/deepset-ai/FARM) makes Transfer Learning with BERT & Co simple, fast and enterprise-ready. It's built upon transformers and provides additional features to simplify the life of developers: Parallelized preprocessing, highly modular design, multi-task learning, experiment tracking, easy debugging and close integration with AWS SageMaker. 
+ +Keywords: Transfer Learning, Modular design, Multi-task learning, Experiment tracking + +## [aitextgen](https://github.com/minimaxir/aitextgen) + +A robust Python tool for text-based AI training and generation using OpenAI's GPT-2 and EleutherAI's GPT Neo/GPT-3 architecture. +[aitextgen](https://github.com/minimaxir/aitextgen) is a Python package that leverages PyTorch, Hugging Face Transformers and pytorch-lightning with specific optimizations for text generation using GPT-2, plus many added features. + +Keywords: Training, Generation + +## [diffgram](https://github.com/diffgram/diffgram) + +Diffgram aims to integrate human supervision into platforms. We support your team programmatically changing the UI (Schema, layout, etc.) like in Streamlit. This means that you can collect and annotate timely data from users. In other words, we are the platform behind your platform, an integrated part of your application, to ship new & better AI products faster. + +Keywords: Human supervision, Platform + +## [ecco](https://github.com/jalammar/ecco) + +Explain, analyze, and visualize NLP language models. Ecco creates interactive visualizations directly in Jupyter notebooks explaining the behavior of Transformer-based language models (like GPT2, BERT, RoBERTA, T5, and T0). + +Keywords: Model explainability + +## [s3prl](https://github.com/s3prl/s3prl) + +[s3prl](https://github.com/s3prl/s3prl) stands for Self-Supervised Speech Pre-training and Representation Learning. Self-supervised speech pre-trained models are called upstream in this toolkit, and are utilized in various downstream tasks. + +Keywords: Speech, Training + +## [ru-dalle](https://github.com/ai-forever/ru-dalle) + +RuDALL-E aims to be similar to DALL-E, targeted to Russian. + +Keywords: DALL-E, Russian + +## [DeepKE](https://github.com/zjunlp/DeepKE) + +[DeepKE](https://github.com/zjunlp/DeepKE) is a knowledge extraction toolkit for knowledge graph construction supporting cnSchema,low-resource, document-level and multimodal scenarios for entity, relation and attribute extraction. + +Keywords: Knowledge Extraction, Knowledge Graphs + +## [Nebuly](https://github.com/nebuly-ai/nebuly) + +Nebuly is the next-generation platform to monitor and optimize your AI costs in one place. The platform connects to all your AI cost sources (compute, API providers, AI software licenses, etc) and centralizes them in one place to give you full visibility on a model basis. The platform also provides optimization recommendations and a co-pilot model that can guide during the optimization process. The platform builds on top of the open-source tools allowing you to optimize the different steps of your AI stack to squeeze out the best possible cost performances. + +Keywords: Optimization, Performance, Monitoring + +## [imaginAIry](https://github.com/brycedrennan/imaginAIry) + +Offers a CLI and a Python API to generate images with Stable Diffusion. It has support for many tools, like image structure control (controlnet), instruction-based image edits (InstructPix2Pix), prompt-based masking (clipseg), among others. + +Keywords: Stable Diffusion, CLI, Python API + +## [sparseml](https://github.com/neuralmagic/sparseml) + +SparseML is an open-source model optimization toolkit that enables you to create inference-optimized sparse models using pruning, quantization, and distillation algorithms. Models optimized with SparseML can then be exported to the ONNX and deployed with DeepSparse for GPU-class performance on CPU hardware. 
+ +Keywords: Model optimization, Pruning, Quantization, Distillation + +## [opacus](https://github.com/pytorch/opacus) + +Opacus is a library that enables training PyTorch models with differential privacy. It supports training with minimal code changes required on the client, has little impact on training performance, and allows the client to online track the privacy budget expended at any given moment. + +Keywords: Differential privacy + +## [LAVIS](https://github.com/salesforce/LAVIS) + +[LAVIS](https://github.com/salesforce/LAVIS) is a Python deep learning library for LAnguage-and-VISion intelligence research and applications. This library aims to provide engineers and researchers with a one-stop solution to rapidly develop models for their specific multimodal scenarios, and benchmark them across standard and customized datasets. It features a unified interface design to access + +Keywords: Multimodal, NLP, Vision + +## [buzz](https://github.com/chidiwilliams/buzz) + +Buzz transcribes and translates audio offline on your personal computer. Powered by OpenAI's Whisper. + +Keywords: Audio transcription, Translation + +## [rust-bert](https://github.com/guillaume-be/rust-bert) + +Rust-native state-of-the-art Natural Language Processing models and pipelines. Port of Hugging Face's Transformers library, using the tch-rs crate and pre-processing from rust-tokenizers. Supports multi-threaded tokenization and GPU inference. This repository exposes the model base architecture, task-specific heads and ready-to-use pipelines. + +Keywords: Rust, BERT, Inference + +## [EasyNLP](https://github.com/alibaba/EasyNLP) + +[EasyNLP](https://github.com/alibaba/EasyNLP) is an easy-to-use NLP development and application toolkit in PyTorch, first released inside Alibaba in 2021. It is built with scalable distributed training strategies and supports a comprehensive suite of NLP algorithms for various NLP applications. [EasyNLP](https://github.com/alibaba/EasyNLP) integrates knowledge distillation and few-shot learning for landing large pre-trained models, together with various popular multi-modality pre-trained models. It provides a unified framework of model training, inference, and deployment for real-world applications. + +Keywords: NLP, Knowledge distillation, Few-shot learning, Multi-modality, Training, Inference, Deployment + +## [TurboTransformers](https://github.com/Tencent/TurboTransformers) + +A fast and user-friendly runtime for transformer inference (Bert, Albert, GPT2, Decoders, etc) on CPU and GPU. + +Keywords: Optimization, Performance + +## [hivemind](https://github.com/learning-at-home/hivemind) + +Hivemind is a PyTorch library for decentralized deep learning across the Internet. Its intended usage is training one large model on hundreds of computers from different universities, companies, and volunteers. + +Keywords: Decentralized training + +## [docquery](https://github.com/impira/docquery) + +DocQuery is a library and command-line tool that makes it easy to analyze semi-structured and unstructured documents (PDFs, scanned images, etc.) using large language models (LLMs). You simply point DocQuery at one or more documents and specify a question you want to ask. DocQuery is created by the team at Impira. 
+ +Keywords: Semi-structured documents, Unstructured documents, LLM, Document Question Answering + +## [CodeGeeX](https://github.com/THUDM/CodeGeeX) + +[CodeGeeX](https://github.com/THUDM/CodeGeeX) is a large-scale multilingual code generation model with 13 billion parameters, pre-trained on a large code corpus of more than 20 programming languages. It has several unique features: +- Multilingual code generation +- Crosslingual code translation +- A customizable programming assistant + +Keywords: Code Generation Model + +## [ktrain](https://github.com/amaiya/ktrain) + +[ktrain](https://github.com/amaiya/ktrain) is a lightweight wrapper for the deep learning library TensorFlow Keras (and other libraries) to help build, train, and deploy neural networks and other machine learning models. Inspired by ML framework extensions like fastai and ludwig, [ktrain](https://github.com/amaiya/ktrain) is designed to make deep learning and AI more accessible and easier to apply for both newcomers and experienced practitioners. + +Keywords: Keras wrapper, Model building, Training, Deployment + +## [FastDeploy](https://github.com/PaddlePaddle/FastDeploy) + +[FastDeploy](https://github.com/PaddlePaddle/FastDeploy) is an easy-to-use and high-performance AI model deployment toolkit for Cloud, Mobile and Edge, with an out-of-the-box, unified experience and end-to-end optimization for over 160+ Text, Vision, Speech and Cross-modal AI models, including image classification, object detection, OCR, face detection, matting, pp-tracking, NLP, stable diffusion, TTS and other tasks, to meet developers' industrial deployment needs for multi-scenario, multi-hardware and multi-platform. + +Keywords: Model deployment, Cloud, Mobile, Edge + +## [underthesea](https://github.com/undertheseanlp/underthesea) + +[underthesea](https://github.com/undertheseanlp/underthesea) is a Vietnamese NLP toolkit. Underthesea is a suite of open source Python modules, data sets and tutorials supporting research and development in Vietnamese Natural Language Processing. It provides an extremely easy API to quickly apply pretrained NLP models to your Vietnamese text, such as word segmentation, part-of-speech tagging (PoS), named entity recognition (NER), text classification and dependency parsing. + +Keywords: Vietnamese, NLP + +## [hasktorch](https://github.com/hasktorch/hasktorch) + +Hasktorch is a library for tensors and neural networks in Haskell. It is an independent open source community project which leverages the core C++ libraries shared by PyTorch. + +Keywords: Haskell, Neural Networks + +## [donut](https://github.com/clovaai/donut) + +Donut, or Document understanding transformer, is a new method of document understanding that utilizes an OCR-free end-to-end Transformer model. + +Donut does not require off-the-shelf OCR engines/APIs, yet it shows state-of-the-art performance on various visual document understanding tasks, such as visual document classification or information extraction (a.k.a. document parsing). + +Keywords: Document Understanding + +## [transformers-interpret](https://github.com/cdpierse/transformers-interpret) + +Transformers Interpret is a model explainability tool designed to work exclusively with the transformers package. + +In line with the philosophy of the Transformers package, Transformers Interpret allows any transformers model to be explained in just two lines. Explainers are available for both text and computer vision models.
Visualizations are also available in notebooks and as savable PNG and HTML files. + +Keywords: Model interpretation, Visualization + +## [mlrun](https://github.com/mlrun/mlrun) + +MLRun is an open MLOps platform for quickly building and managing continuous ML applications across their lifecycle. MLRun integrates into your development and CI/CD environment and automates the delivery of production data, ML pipelines, and online applications, significantly reducing engineering efforts, time to production, and computation resources. With MLRun, you can choose any IDE on your local machine or on the cloud. MLRun breaks the silos between data, ML, software, and DevOps/MLOps teams, enabling collaboration and fast continuous improvements. + +Keywords: MLOps + +## [FederatedScope](https://github.com/alibaba/FederatedScope) + +[FederatedScope](https://github.com/alibaba/FederatedScope) is a comprehensive federated learning platform that provides convenient usage and flexible customization for various federated learning tasks in both academia and industry. Based on an event-driven architecture, [FederatedScope](https://github.com/alibaba/FederatedScope) integrates rich collections of functionalities to satisfy the burgeoning demands from federated learning, and aims to build up an easy-to-use platform for promoting learning safely and effectively. + +Keywords: Federated learning, Event-driven + +## [pythainlp](https://github.com/PyThaiNLP/pythainlp) + +PyThaiNLP is a Python package for text processing and linguistic analysis, similar to NLTK, with a focus on the Thai language. + +Keywords: Thai, NLP, NLTK + +## [FlagAI](https://github.com/FlagAI-Open/FlagAI) + +[FlagAI](https://github.com/FlagAI-Open/FlagAI) (Fast LArge-scale General AI models) is a fast, easy-to-use and extensible toolkit for large-scale models. Its goal is to support training, fine-tuning, and deployment of large-scale models on various downstream tasks with multi-modality. + +Keywords: Large models, Training, Fine-tuning, Deployment, Multi-modal + +## [pyserini](https://github.com/castorini/pyserini) + +[pyserini](https://github.com/castorini/pyserini) is a Python toolkit for reproducible information retrieval research with sparse and dense representations. Retrieval using sparse representations is provided via integration with the group's Anserini IR toolkit. Retrieval using dense representations is provided via integration with Facebook's Faiss library. + +Keywords: IR, Information Retrieval, Dense, Sparse + +## [baal](https://github.com/baal-org/baal) + +[baal](https://github.com/baal-org/baal) is an active learning library that supports both industrial applications and research use cases. [baal](https://github.com/baal-org/baal) currently supports Monte-Carlo Dropout, MCDropConnect, deep ensembles, and semi-supervised learning. + +Keywords: Active Learning, Research, Labeling + +## [cleanlab](https://github.com/cleanlab/cleanlab) + +[cleanlab](https://github.com/cleanlab/cleanlab) is the standard data-centric AI package for data quality and machine learning with messy, real-world data and labels. For text, image, tabular, audio (among others) datasets, you can use cleanlab to automatically: detect data issues (outliers, label errors, near duplicates, etc.), train robust ML models, infer consensus + annotator quality for multi-annotator data, and suggest data to (re)label next (active learning).
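A minimal sketch of the label-issue detection workflow described above, assuming cleanlab 2.x and scikit-learn; the synthetic data and the logistic-regression model are illustrative only:

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from cleanlab.filter import find_label_issues

# Toy dataset with a handful of deliberately flipped labels.
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
labels = (X[:, 0] > 0).astype(int)
labels[:10] = 1 - labels[:10]

# Out-of-sample predicted probabilities for every example.
pred_probs = cross_val_predict(
    LogisticRegression(), X, labels, cv=5, method="predict_proba"
)

# Indices of examples whose given label looks inconsistent with the model.
issue_indices = find_label_issues(
    labels, pred_probs, return_indices_ranked_by="self_confidence"
)
print(issue_indices[:10])
```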
+ +Keywords: Data-Centric AI, Data Quality, Noisy Labels, Outlier Detection, Active Learning + +## [BentoML](https://github.com/bentoml/BentoML) + +[BentoML](https://github.com/bentoml) is the unified framework for for building, shipping, and scaling production-ready AI applications incorporating traditional ML, pre-trained AI models, Generative and Large Language Models. +All Hugging Face models and pipelines can be seamlessly integrated into BentoML applications, enabling the running of models on the most suitable hardware and independent scaling based on usage. + +Keywords: BentoML, Framework, Deployment, AI Applications + +## [LLaMA-Efficient-Tuning](https://github.com/hiyouga/LLaMA-Efficient-Tuning) + +[LLaMA-Efficient-Tuning](https://github.com/hiyouga/LLaMA-Efficient-Tuning) offers a user-friendly fine-tuning framework that incorporates PEFT. The repository includes training(fine-tuning) and inference examples for LLaMA-2, BLOOM, Falcon, Baichuan, Qwen, and other LLMs. A ChatGLM version is also available in [ChatGLM-Efficient-Tuning](https://github.com/hiyouga/ChatGLM-Efficient-Tuning). + +Keywords: PEFT, fine-tuning, LLaMA-2, ChatGLM, Qwen + diff --git a/conftest.py b/conftest.py index c3d4f70326d9..247e5eb92d53 100644 --- a/conftest.py +++ b/conftest.py @@ -20,6 +20,10 @@ import warnings from os.path import abspath, dirname, join +import _pytest + +from transformers.testing_utils import HfDoctestModule, HfDocTestParser + # allow having multiple repository checkouts and not needing to remember to rerun # 'pip install -e .[dev]' when switching between checkouts and running tests. @@ -38,7 +42,10 @@ def pytest_configure(config): config.addinivalue_line( "markers", "is_pt_flax_cross_test: mark test to run only when PT and FLAX interactions are tested" ) + config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipelines are tested") config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment") + config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate") + config.addinivalue_line("markers", "tool_tests: mark the tool tests that are run on their specific schedule") def pytest_addoption(parser): @@ -62,7 +69,7 @@ def pytest_sessionfinish(session, exitstatus): # Doctest custom flag to ignore output. -IGNORE_RESULT = doctest.register_optionflag('IGNORE_RESULT') +IGNORE_RESULT = doctest.register_optionflag("IGNORE_RESULT") OutputChecker = doctest.OutputChecker @@ -75,3 +82,5 @@ def check_output(self, want, got, optionflags): doctest.OutputChecker = CustomOutputChecker +_pytest.doctest.DoctestModule = HfDoctestModule +doctest.DocTestParser = HfDocTestParser diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index 1c79983d3b63..a6c672e1a9df 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04 +FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive @@ -9,11 +9,11 @@ SHELL ["sh", "-lc"] # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant # to be used as arguments for docker build (so far). -ARG PYTORCH='1.13.0' +ARG PYTORCH='2.0.1' # (not always a valid torch version) ARG INTEL_TORCH_EXT='1.11.0' # Example: `cu102`, `cu113`, etc. 
-ARG CUDA='cu116' +ARG CUDA='cu118' RUN apt update RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs @@ -22,7 +22,6 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip ARG REF=main RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF -RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] # TODO: Handle these in a python utility script RUN [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile @@ -32,26 +31,35 @@ RUN echo torch=$VERSION # TODO: We might need to specify proper versions that work with a specific torch version (especially for past CI). RUN [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA -RUN python3 -m pip install --no-cache-dir -U tensorflow==2.11 -RUN python3 -m pip install --no-cache-dir -U tensorflow_probability -RUN python3 -m pip uninstall -y flax jax +RUN python3 -m pip install --no-cache-dir -U tensorflow==2.13 protobuf==3.20.3 tensorflow_text tensorflow_probability -# To include the change in this commit https://github.com/onnx/tensorflow-onnx/commit/ddca3a5eb2d912f20fe7e0568dd1a3013aee9fa3 -# Otherwise, we get tf2onnx==1.8 (caused by `flatbuffers` version), and some tests fail with `ValueError: from_keras requires input_signature`. -# TODO: remove this line once the conflict is resolved in these libraries. -RUN python3 -m pip install --no-cache-dir git+https://github.com/onnx/tensorflow-onnx.git@ddca3a5eb2d912f20fe7e0568dd1a3013aee9fa3 +RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] + +RUN python3 -m pip uninstall -y flax jax -RUN python3 -m pip install --no-cache-dir intel_extension_for_pytorch==$INTEL_TORCH_EXT+cpu -f https://software.intel.com/ipex-whl-stable +RUN python3 -m pip install --no-cache-dir intel_extension_for_pytorch==$INTEL_TORCH_EXT+cpu -f https://developer.intel.com/ipex-whl-stable-cpu RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract RUN python3 -m pip install -U "itsdangerous<2.1.0" RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate +RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/peft@main#egg=peft + # Add bitsandbytes for mixed int8 testing RUN python3 -m pip install --no-cache-dir bitsandbytes -RUN python3 -m pip install --no-cache-dir decord +# Add auto-gptq for gtpq quantization testing +RUN python3 -m pip install --no-cache-dir auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ + +# Add einops for additional model testing +RUN python3 -m pip install --no-cache-dir einops + +# For bettertransformer + gptq +RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum + +# For video model testing +RUN python3 -m pip install --no-cache-dir decord av==9.2.0 # For `dinat` model RUN python3 -m pip install --no-cache-dir natten -f https://shi-labs.com/natten/wheels/$CUDA/ diff --git a/docker/transformers-doc-builder/Dockerfile b/docker/transformers-doc-builder/Dockerfile index 0e5b072d4889..c9f6adb63e0c 100644 --- 
a/docker/transformers-doc-builder/Dockerfile +++ b/docker/transformers-doc-builder/Dockerfile @@ -11,7 +11,6 @@ RUN apt-get -y update && apt-get install -y libsndfile1-dev && apt install -y te RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed] RUN python3 -m pip install --no-cache-dir torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract -RUN python3 -m pip install --no-cache-dir pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com RUN python3 -m pip install -U "itsdangerous<2.1.0" # Test if the image could successfully build the doc. before publishing the image diff --git a/docker/transformers-past-gpu/Dockerfile b/docker/transformers-past-gpu/Dockerfile index 99fb550c6a35..0cdc9ff07124 100644 --- a/docker/transformers-past-gpu/Dockerfile +++ b/docker/transformers-past-gpu/Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_DOCKER_IMAGE="nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04" +ARG BASE_DOCKER_IMAGE FROM $BASE_DOCKER_IMAGE LABEL maintainer="Hugging Face" @@ -8,7 +8,7 @@ ARG DEBIAN_FRONTEND=noninteractive SHELL ["sh", "-lc"] RUN apt update -RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs +RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs libaio-dev RUN git lfs install RUN python3 -m pip install --no-cache-dir --upgrade pip @@ -23,9 +23,11 @@ RUN cd transformers && python3 setup.py develop ARG FRAMEWORK ARG VERSION +# Control `setuptools` version to avoid some issues +RUN [ "$VERSION" != "1.10" ] && python3 -m pip install -U setuptools || python3 -m pip install -U "setuptools<=59.5" + # Remove all frameworks -# (`accelerate` requires `torch`, and this causes import issues for TF-only testing) -RUN python3 -m pip uninstall -y torch torchvision torchaudio accelerate tensorflow jax flax +RUN python3 -m pip uninstall -y torch torchvision torchaudio tensorflow jax flax # Get the libraries and their versions to install, and write installation command to `~/.profile`. RUN python3 ./transformers/utils/past_ci_versions.py --framework $FRAMEWORK --version $VERSION @@ -34,4 +36,24 @@ RUN python3 ./transformers/utils/past_ci_versions.py --framework $FRAMEWORK --ve RUN echo "INSTALL_CMD = $INSTALL_CMD" RUN $INSTALL_CMD +RUN [ "$FRAMEWORK" != "pytorch" ] && echo "`deepspeed-testing` installation is skipped" || python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] + +# Remove `accelerate`: it requires `torch`, and this causes import issues for TF-only testing +# We will install `accelerate@main` in Past CI workflow file +RUN python3 -m pip uninstall -y accelerate + +# Uninstall `torch-tensorrt` and `apex` shipped with the base image +RUN python3 -m pip uninstall -y torch-tensorrt apex + +# Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout) +RUN python3 -m pip uninstall -y deepspeed +# This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.) +# Issue: https://github.com/microsoft/DeepSpeed/issues/2010 +# RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \ +# DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 + RUN python3 -m pip install -U "itsdangerous<2.1.0" + +# When installing in editable mode, `transformers` is not recognized as a package. 
+# this line must be added in order for python to be aware of transformers. +RUN cd transformers && python3 setup.py develop diff --git a/docker/transformers-pytorch-amd-gpu/Dockerfile b/docker/transformers-pytorch-amd-gpu/Dockerfile new file mode 100644 index 000000000000..f19cd4edb0e4 --- /dev/null +++ b/docker/transformers-pytorch-amd-gpu/Dockerfile @@ -0,0 +1,31 @@ +FROM rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_2.0.1 +LABEL maintainer="Hugging Face" + +ARG DEBIAN_FRONTEND=noninteractive + +RUN apt update && \ + apt install -y --no-install-recommends git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg && \ + apt clean && \ + rm -rf /var/lib/apt/lists/* + +RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0" + +# If set to nothing, will install the latest version +ARG PYTORCH='2.0.1' +ARG TORCH_VISION='0.15.2' +ARG TORCH_AUDIO='2.0.2' +ARG ROCM='5.6' + +RUN git clone --depth 1 --branch v$TORCH_AUDIO https://github.com/pytorch/audio.git +RUN cd audio && USE_ROCM=1 USE_CUDA=0 python setup.py install + +ARG REF=main +WORKDIR / +RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF +RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video] + +RUN python3 -m pip uninstall -y tensorflow flax + +# When installing in editable mode, `transformers` is not recognized as a package. +# this line must be added in order for python to be aware of transformers. +RUN cd transformers && python3 setup.py develop diff --git a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile index d19092c2dcd4..c2ce626b474e 100644 --- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile @@ -1,12 +1,12 @@ -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel_22-04.html#rel_22-04 -FROM nvcr.io/nvidia/pytorch:22.04-py3 +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-22-12.html#rel-22-12 +FROM nvcr.io/nvidia/pytorch:22.12-py3 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive -ARG PYTORCH='1.13.0' +ARG PYTORCH='2.0.1' # Example: `cu102`, `cu113`, etc. -ARG CUDA='cu116' +ARG CUDA='cu118' RUN apt -y update RUN apt install -y libaio-dev @@ -15,6 +15,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip ARG REF=main RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF +RUN python3 -m pip uninstall -y torch torchvision torchaudio + # Install latest release PyTorch # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) 
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) @@ -22,25 +24,31 @@ RUN python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchau RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] -RUN python3 -m pip install torch-tensorrt==1.3.0 --find-links https://github.com/pytorch/TensorRT/releases/expanded_assets/v1.3.0 +RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate + +# Uninstall `transformer-engine` shipped with the base image +RUN python3 -m pip uninstall -y transformer-engine + +# Uninstall `torch-tensorrt` shipped with the base image +RUN python3 -m pip uninstall -y torch-tensorrt # recompile apex RUN python3 -m pip uninstall -y apex RUN git clone https://github.com/NVIDIA/apex # `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners -RUN cd apex && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check . +RUN cd apex && git checkout 82ee367f3da74b4cd62a1fb47aa9806f0f47b58b && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check . # Pre-build **latest** DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout) RUN python3 -m pip uninstall -y deepspeed # This has to be run (again) inside the GPU VMs running the tests. # The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests. # TODO: Find out why test fail. -RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 +RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. RUN cd transformers && python3 setup.py develop # The base image ships with `pydantic==1.8.2` which is not working - i.e. the next command fails -RUN python3 -m pip install -U --no-cache-dir pydantic +RUN python3 -m pip install -U --no-cache-dir "pydantic<2" RUN python3 -c "from deepspeed.launcher.runner import main" diff --git a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile index 573e09c22a9c..b3ead0c61547 100644 --- a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile @@ -1,10 +1,11 @@ -FROM nvcr.io/nvidia/pytorch:21.03-py3 +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-22-12.html#rel-22-12 +FROM nvcr.io/nvidia/pytorch:22.12-py3 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive # Example: `cu102`, `cu113`, etc. 
-ARG CUDA='cu113' +ARG CUDA='cu118' RUN apt -y update RUN apt install -y libaio-dev @@ -13,6 +14,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip ARG REF=main RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF +RUN python3 -m pip uninstall -y torch torchvision torchaudio + # Install **nightly** release PyTorch (flag `--pre`) # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) @@ -20,30 +23,38 @@ RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] +RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate + +# Uninstall `transformer-engine` shipped with the base image +RUN python3 -m pip uninstall -y transformer-engine + +# Uninstall `torch-tensorrt` and `apex` shipped with the base image +RUN python3 -m pip uninstall -y torch-tensorrt apex + # Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout) RUN python3 -m pip uninstall -y deepspeed # This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.) # Issue: https://github.com/microsoft/DeepSpeed/issues/2010 # RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \ -# DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 - -# For `torchdynamo` tests -# (see https://github.com/huggingface/transformers/pull/17765) -RUN git clone https://github.com/pytorch/functorch -RUN python3 -m pip install --no-cache-dir ./functorch[aot] -RUN cd functorch && python3 setup.py develop - -RUN git clone https://github.com/pytorch/torchdynamo -RUN python3 -m pip install -r ./torchdynamo/requirements.txt -RUN cd torchdynamo && python3 setup.py develop - -# install TensorRT -RUN python3 -m pip install --no-cache-dir -U nvidia-pyindex -RUN python3 -m pip install --no-cache-dir -U nvidia-tensorrt==8.2.4.2 +# DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 -# install torch_tensorrt (fx path) -RUN git clone https://github.com/pytorch/TensorRT.git -RUN cd TensorRT/py && python3 setup.py install --fx-only +## For `torchdynamo` tests +## (see https://github.com/huggingface/transformers/pull/17765) +#RUN git clone https://github.com/pytorch/functorch +#RUN python3 -m pip install --no-cache-dir ./functorch[aot] +#RUN cd functorch && python3 setup.py develop +# +#RUN git clone https://github.com/pytorch/torchdynamo +#RUN python3 -m pip install -r ./torchdynamo/requirements.txt +#RUN cd torchdynamo && python3 setup.py develop +# +## install TensorRT +#RUN python3 -m pip install --no-cache-dir -U nvidia-pyindex +#RUN python3 -m pip install --no-cache-dir -U nvidia-tensorrt==8.2.4.2 +# +## install torch_tensorrt (fx path) +#RUN git clone https://github.com/pytorch/TensorRT.git +#RUN cd TensorRT/py && python3 setup.py install --fx-only # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. 
diff --git a/docker/transformers-pytorch-gpu/Dockerfile b/docker/transformers-pytorch-gpu/Dockerfile index d34dcc116aeb..702a837abd01 100644 --- a/docker/transformers-pytorch-gpu/Dockerfile +++ b/docker/transformers-pytorch-gpu/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04 +FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive @@ -9,19 +9,20 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip ARG REF=main RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF -RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video] # If set to nothing, will install the latest version -ARG PYTORCH='1.13.0' +ARG PYTORCH='2.1.0' ARG TORCH_VISION='' ARG TORCH_AUDIO='' # Example: `cu102`, `cu113`, etc. -ARG CUDA='cu116' +ARG CUDA='cu118' RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA RUN [ ${#TORCH_VISION} -gt 0 ] && VERSION='torchvision=='TORCH_VISION'.*' || VERSION='torchvision'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA RUN [ ${#TORCH_AUDIO} -gt 0 ] && VERSION='torchaudio=='TORCH_AUDIO'.*' || VERSION='torchaudio'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA +RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video] + RUN python3 -m pip uninstall -y tensorflow flax RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract diff --git a/docker/transformers-tensorflow-gpu/Dockerfile b/docker/transformers-tensorflow-gpu/Dockerfile index 09e8512f2ce8..df9039a0c4d2 100644 --- a/docker/transformers-tensorflow-gpu/Dockerfile +++ b/docker/transformers-tensorflow-gpu/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04 +FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive @@ -12,7 +12,7 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers && RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-tensorflow,testing] # If set to nothing, will install the latest version -ARG TENSORFLOW='2.11' +ARG TENSORFLOW='2.13' RUN [ ${#TENSORFLOW} -gt 0 ] && VERSION='tensorflow=='$TENSORFLOW'.*' || VERSION='tensorflow'; python3 -m pip install --no-cache-dir -U $VERSION RUN python3 -m pip uninstall -y torch flax diff --git a/docs/README.md b/docs/README.md index 9aa74d4de94b..9269cc5bd291 100644 --- a/docs/README.md +++ b/docs/README.md @@ -81,10 +81,10 @@ The `preview` command only works with existing doc files. When you add a complet ## Adding a new element to the navigation bar -Accepted files are Markdown (.md or .mdx). +Accepted files are Markdown (.md). Create a file with its extension and put it in the source directory. You can then link it to the toc-tree by putting -the filename without the extension in the [`_toctree.yml`](https://github.com/huggingface/transformers/blob/main/docs/source/_toctree.yml) file. +the filename without the extension in the [`_toctree.yml`](https://github.com/huggingface/transformers/blob/main/docs/source/en/_toctree.yml) file. 
## Renaming section headers and moving sections @@ -109,7 +109,7 @@ Sections that were moved: Use the relative style to link to the new file so that the versioned docs continue to work. -For an example of a rich moved section set please see the very end of [the Trainer doc](https://github.com/huggingface/transformers/blob/main/docs/source/en/main_classes/trainer.mdx). +For an example of a rich moved section set please see the very end of [the Trainer doc](https://github.com/huggingface/transformers/blob/main/docs/source/en/main_classes/trainer.md). ## Writing Documentation - Specification @@ -138,7 +138,7 @@ When translating, refer to the guide at [./TRANSLATING.md](https://github.com/hu When adding a new model: -- Create a file `xxx.mdx` or under `./source/model_doc` (don't hesitate to copy an existing file as template). +- Create a file `xxx.md` or under `./source/model_doc` (don't hesitate to copy an existing file as template). - Link that file in `./source/_toctree.yml`. - Write a short overview of the model: - Overview with paper & authors @@ -147,7 +147,7 @@ When adding a new model: - Add the classes that should be linked in the model. This generally includes the configuration, the tokenizer, and every model of that class (the base model, alongside models with additional heads), both in PyTorch and TensorFlow. The order is generally: - - Configuration, + - Configuration - Tokenizer - PyTorch base model - PyTorch head models @@ -364,25 +364,9 @@ We use pytests' [doctest integration](https://docs.pytest.org/doctest.html) to v For Transformers, the doctests are run on a daily basis via GitHub Actions as can be seen [here](https://github.com/huggingface/transformers/actions/workflows/doctests.yml). -To include your example in the daily doctests, you need to add the filename that -contains the example docstring to the [documentation_tests.txt](../utils/documentation_tests.txt). 
- ### For Python files -You will first need to run the following command (from the root of the repository) to prepare the doc file (doc-testing needs to add additional lines that we don't include in the doc source files): - -```bash -python utils/prepare_for_doc_test.py src docs -``` - -If you work on a specific python module, say `modeling_wav2vec2.py`, you can run the command as follows (to avoid the unnecessary temporary changes in irrelevant files): - -```bash -python utils/prepare_for_doc_test.py src/transformers/utils/doc.py src/transformers/models/wav2vec2/modeling_wav2vec2.py -``` -(`utils/doc.py` should always be included) - -Then you can run all the tests in the docstrings of a given file with the following command, here is how we test the modeling file of Wav2Vec2 for instance: +Run all the tests in the docstrings of a given file with the following command, here is how we test the modeling file of Wav2Vec2 for instance: ```bash pytest --doctest-modules src/transformers/models/wav2vec2/modeling_wav2vec2.py -sv --doctest-continue-on-failure @@ -394,30 +378,12 @@ If you want to isolate a specific docstring, just add `::` after the file name t pytest --doctest-modules src/transformers/models/wav2vec2/modeling_wav2vec2.py::transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.forward -sv --doctest-continue-on-failure ``` -Once you're done, you can run the following command (still from the root of the repository) to undo the changes made by the first command before committing: - -```bash -python utils/prepare_for_doc_test.py src docs --remove_new_line -``` - ### For Markdown files -You will first need to run the following command (from the root of the repository) to prepare the doc file (doc-testing needs to add additional lines that we don't include in the doc source files): - -```bash -python utils/prepare_for_doc_test.py src docs -``` - -Then you can test locally a given file with this command (here testing the quicktour): - -```bash -pytest --doctest-modules docs/source/quicktour.mdx -sv --doctest-continue-on-failure --doctest-glob="*.mdx" -``` - -Once you're done, you can run the following command (still from the root of the repository) to undo the changes made by the first command before committing: +You can test locally a given file with this command (here testing the quicktour): ```bash -python utils/prepare_for_doc_test.py src docs --remove_new_line +pytest --doctest-modules docs/source/quicktour.md -sv --doctest-continue-on-failure --doctest-glob="*.md" ``` ### Writing doctests diff --git a/docs/TRANSLATING.md b/docs/TRANSLATING.md index c6f5c45baf02..420e7a8b16a1 100644 --- a/docs/TRANSLATING.md +++ b/docs/TRANSLATING.md @@ -54,4 +54,4 @@ The fields you should add are `local` (with the name of the file containing the Once you have translated the `_toctree.yml` file, you can start translating the [MDX](https://mdxjs.com/) files associated with your docs chapter. -> 🙋 If you'd like others to help you with the translation, you should [open an issue](https://github.com/huggingface/transformers/issues) and tag @sgugger. +> 🙋 If you'd like others to help you with the translation, you should [open an issue](https://github.com/huggingface/transformers/issues) and tag @stevhliu and @MKhalusova. 
diff --git a/docs/source/de/_toctree.yml b/docs/source/de/_toctree.yml index 8b15c2c53e7c..d18a14ce9298 100644 --- a/docs/source/de/_toctree.yml +++ b/docs/source/de/_toctree.yml @@ -15,8 +15,28 @@ title: Vorverarbeiten - local: training title: Optimierung eines vortrainierten Modells + - local: run_scripts + title: Trainieren mit einem Skript - local: accelerate title: Verteiltes Training mit 🤗 Accelerate + - local: peft + title: Laden und Trainieren von Adaptern mit 🤗 PEFT - local: model_sharing title: Ein Modell teilen + - local: transformers_agents + title: Agents + - local: llm_tutorial + title: Generation with LLMs title: Tutorials +- sections: + - local: add_new_model + title: Wie fügt man ein Modell zu 🤗 Transformers hinzu? + - local: add_tensorflow_model + title: Wie konvertiert man ein 🤗 Transformers-Modell in TensorFlow? + - local: add_new_pipeline + title: Wie fügt man eine Pipeline zu 🤗 Transformers hinzu? + - local: testing + title: Testen + - local: pr_checks + title: Überprüfung einer Pull Request + title: Contribute \ No newline at end of file diff --git a/docs/source/de/accelerate.md b/docs/source/de/accelerate.md new file mode 100644 index 000000000000..98a11cbdc417 --- /dev/null +++ b/docs/source/de/accelerate.md @@ -0,0 +1,136 @@ + + +# Verteiltes Training mit 🤗 Accelerate + +Da die Modelle immer größer werden, hat sich die Parallelität als Strategie zum Trainieren größerer Modelle auf begrenzter Hardware und zur Beschleunigung der Trainingsgeschwindigkeit um mehrere Größenordnungen erwiesen. Bei Hugging Face haben wir die Bibliothek [🤗 Accelerate](https://huggingface.co/docs/accelerate) entwickelt, um Nutzern zu helfen, ein 🤗 Transformers-Modell auf jeder Art von verteiltem Setup zu trainieren, egal ob es sich um mehrere GPUs auf einer Maschine oder mehrere GPUs auf mehreren Maschinen handelt. In diesem Tutorial lernen Sie, wie Sie Ihre native PyTorch-Trainingsschleife anpassen, um das Training in einer verteilten Umgebung zu ermöglichen. + +## Einrichtung + +Beginnen Sie mit der Installation von 🤗 Accelerate: + +```bash +pip install accelerate +``` + +Dann importieren und erstellen Sie ein [`~accelerate.Accelerator`]-Objekt. Der [`~accelerate.Accelerator`] wird automatisch Ihre Art der verteilten Einrichtung erkennen und alle notwendigen Komponenten für das Training initialisieren. Sie müssen Ihr Modell nicht explizit auf einem Gerät platzieren. + +```py +>>> from accelerate import Accelerator + +>>> accelerator = Accelerator() +``` + +## Vorbereiten auf die Beschleunigung + +Der nächste Schritt ist die Übergabe aller relevanten Trainingsobjekte an die Methode [`~accelerate.Accelerator.prepare`]. Dazu gehören Ihre Trainings- und Evaluierungs-DataLoader, ein Modell und ein Optimierer: + +```py +>>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( +... train_dataloader, eval_dataloader, model, optimizer +... ) +``` + +## Rückwärts + +Die letzte Ergänzung besteht darin, das typische `loss.backward()` in der Trainingsschleife durch die 🤗 Accelerate-Methode [`~accelerate.Accelerator.backward`] zu ersetzen: + +```py +>>> for epoch in range(num_epochs): +... for batch in train_dataloader: +... outputs = model(**batch) +... loss = outputs.loss +... accelerator.backward(loss) + +... optimizer.step() +... lr_scheduler.step() +... optimizer.zero_grad() +... progress_bar.update(1) +``` + +Wie Sie im folgenden Code sehen können, müssen Sie nur vier zusätzliche Codezeilen zu Ihrer Trainingsschleife hinzufügen, um verteiltes Training zu ermöglichen! 
+ +```diff ++ from accelerate import Accelerator + from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler + ++ accelerator = Accelerator() + + model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) + optimizer = AdamW(model.parameters(), lr=3e-5) + +- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +- model.to(device) + ++ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( ++ train_dataloader, eval_dataloader, model, optimizer ++ ) + + num_epochs = 3 + num_training_steps = num_epochs * len(train_dataloader) + lr_scheduler = get_scheduler( + "linear", + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=num_training_steps + ) + + progress_bar = tqdm(range(num_training_steps)) + + model.train() + for epoch in range(num_epochs): + for batch in train_dataloader: +- batch = {k: v.to(device) for k, v in batch.items()} + outputs = model(**batch) + loss = outputs.loss +- loss.backward() ++ accelerator.backward(loss) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) +``` + +## Trainieren + +Sobald Sie die entsprechenden Codezeilen hinzugefügt haben, starten Sie Ihr Training in einem Skript oder einem Notebook wie Colaboratory. + +### Trainieren mit einem Skript + +Wenn Sie Ihr Training mit einem Skript durchführen, führen Sie den folgenden Befehl aus, um eine Konfigurationsdatei zu erstellen und zu speichern: + +```bash +accelerate config +``` + +Dann starten Sie Ihr Training mit: + +```bash +accelerate launch train.py +``` + +### Trainieren mit einem Notebook + +🤗 Accelerate kann auch in einem Notebook laufen, wenn Sie planen, die TPUs von Colaboratory zu verwenden. Verpacken Sie den gesamten Code, der für das Training verantwortlich ist, in eine Funktion und übergeben Sie diese an [`~accelerate.notebook_launcher`]: + +```py +>>> from accelerate import notebook_launcher + +>>> notebook_launcher(training_function) +``` + +Weitere Informationen über 🤗 Accelerate und seine umfangreichen Funktionen finden Sie in der [Dokumentation](https://huggingface.co/docs/accelerate). \ No newline at end of file diff --git a/docs/source/de/accelerate.mdx b/docs/source/de/accelerate.mdx deleted file mode 100644 index 64f85f205f8a..000000000000 --- a/docs/source/de/accelerate.mdx +++ /dev/null @@ -1,132 +0,0 @@ - - -# Verteiltes Training mit 🤗 Accelerate - -Da die Modelle immer größer werden, hat sich die Parallelität als Strategie zum Trainieren größerer Modelle auf begrenzter Hardware und zur Beschleunigung der Trainingsgeschwindigkeit um mehrere Größenordnungen erwiesen. Bei Hugging Face haben wir die Bibliothek [🤗 Accelerate](https://huggingface.co/docs/accelerate) entwickelt, um Nutzern zu helfen, ein 🤗 Transformers-Modell auf jeder Art von verteiltem Setup zu trainieren, egal ob es sich um mehrere GPUs auf einer Maschine oder mehrere GPUs auf mehreren Maschinen handelt. In diesem Tutorial lernen Sie, wie Sie Ihre native PyTorch-Trainingsschleife anpassen, um das Training in einer verteilten Umgebung zu ermöglichen. - -## Einrichtung - -Beginnen Sie mit der Installation von 🤗 Accelerate: - -```bash -pip install accelerate -``` - -Dann importieren und erstellen Sie ein [`~accelerate.Accelerator`]-Objekt. Der [`~accelerate.Accelerator`] wird automatisch Ihre Art der verteilten Einrichtung erkennen und alle notwendigen Komponenten für das Training initialisieren. Sie müssen Ihr Modell nicht explizit auf einem Gerät platzieren. 
- -```py ->>> from accelerate import Accelerator - ->>> accelerator = Accelerator() -``` - -## Vorbereiten auf die Beschleunigung - -Der nächste Schritt ist die Übergabe aller relevanten Trainingsobjekte an die Methode [`~accelerate.Accelerator.prepare`]. Dazu gehören Ihre Trainings- und Evaluierungs-DataLoader, ein Modell und ein Optimierer: - -```py ->>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( -... train_dataloader, eval_dataloader, model, optimizer -... ) -``` - -## Rückwärts - -Die letzte Ergänzung besteht darin, das typische `loss.backward()` in der Trainingsschleife durch die 🤗 Accelerate-Methode [`~accelerate.Accelerator.backward`] zu ersetzen: - -```py ->>> for epoch in range(num_epochs): -... for batch in train_dataloader: -... outputs = model(**batch) -... loss = outputs.loss -... accelerator.backward(loss) - -... optimizer.step() -... lr_scheduler.step() -... optimizer.zero_grad() -... progress_bar.update(1) -``` - -Wie Sie im folgenden Code sehen können, müssen Sie nur vier zusätzliche Codezeilen zu Ihrer Trainingsschleife hinzufügen, um verteiltes Training zu ermöglichen! - -```diff -+ from accelerate import Accelerator - from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler - -+ accelerator = Accelerator() - - model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) - optimizer = AdamW(model.parameters(), lr=3e-5) - -- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") -- model.to(device) - -+ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( -+ train_dataloader, eval_dataloader, model, optimizer -+ ) - - num_epochs = 3 - num_training_steps = num_epochs * len(train_dataloader) - lr_scheduler = get_scheduler( - "linear", - optimizer=optimizer, - num_warmup_steps=0, - num_training_steps=num_training_steps - ) - - progress_bar = tqdm(range(num_training_steps)) - - model.train() - for epoch in range(num_epochs): - for batch in train_dataloader: -- batch = {k: v.to(device) for k, v in batch.items()} - outputs = model(**batch) - loss = outputs.loss -- loss.backward() -+ accelerator.backward(loss) - - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad() - progress_bar.update(1) -``` - -## Trainieren - -Sobald Sie die entsprechenden Codezeilen hinzugefügt haben, starten Sie Ihr Training in einem Skript oder einem Notebook wie Colaboratory. - -### Trainieren mit einem Skript - -Wenn Sie Ihr Training mit einem Skript durchführen, führen Sie den folgenden Befehl aus, um eine Konfigurationsdatei zu erstellen und zu speichern: - -```bash -accelerate config -``` - -Dann starten Sie Ihr Training mit: - -```bash -accelerate launch train.py -``` - -### Trainieren mit einem Notebook - -🤗 Accelerate kann auch in einem Notebook laufen, wenn Sie planen, die TPUs von Colaboratory zu verwenden. Verpacken Sie den gesamten Code, der für das Training verantwortlich ist, in eine Funktion und übergeben Sie diese an [`~accelerate.notebook_launcher`]: - -```py ->>> from accelerate import notebook_launcher - ->>> notebook_launcher(training_function) -``` - -Weitere Informationen über 🤗 Accelerate und seine umfangreichen Funktionen finden Sie in der [Dokumentation](https://huggingface.co/docs/accelerate). 
\ No newline at end of file diff --git a/docs/source/de/add_new_model.md b/docs/source/de/add_new_model.md new file mode 100644 index 000000000000..2c1f0f6a00ad --- /dev/null +++ b/docs/source/de/add_new_model.md @@ -0,0 +1,895 @@ + + +# Wie kann ich ein Modell zu 🤗 Transformers hinzufügen? + +Die 🤗 Transformers-Bibliothek ist dank der Beiträge der Community oft in der Lage, neue Modelle anzubieten. Aber das kann ein anspruchsvolles Projekt sein und erfordert eine eingehende Kenntnis der 🤗 Transformers-Bibliothek und des zu implementierenden Modells. Bei Hugging Face versuchen wir, mehr Mitgliedern der Community die Möglichkeit zu geben, aktiv Modelle hinzuzufügen, und wir haben diese Anleitung zusammengestellt, die Sie durch den Prozess des Hinzufügens eines PyTorch-Modells führt (stellen Sie sicher, dass Sie [PyTorch installiert haben](https://pytorch.org/get-started/locally/)). + + + +Wenn Sie daran interessiert sind, ein TensorFlow-Modell zu implementieren, werfen Sie einen Blick in die Anleitung [How to convert a 🤗 Transformers model to TensorFlow](add_tensorflow_model)! + + + +Auf dem Weg dorthin, werden Sie: + +- Einblicke in bewährte Open-Source-Verfahren erhalten +- die Konstruktionsprinzipien hinter einer der beliebtesten Deep-Learning-Bibliotheken verstehen +- lernen Sie, wie Sie große Modelle effizient testen können +- lernen Sie, wie Sie Python-Hilfsprogramme wie `black`, `ruff` und `make fix-copies` integrieren, um sauberen und lesbaren Code zu gewährleisten + +Ein Mitglied des Hugging Face-Teams wird Ihnen dabei zur Seite stehen, damit Sie nicht alleine sind. 🤗 ❤️ + +Um loszulegen, öffnen Sie eine [New model addition](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&template=new-model-addition.yml) Ausgabe für das Modell, das Sie in 🤗 Transformers sehen möchten. Wenn Sie nicht besonders wählerisch sind, wenn es darum geht, ein bestimmtes Modell beizusteuern, können Sie nach dem [New model label](https://github.com/huggingface/transformers/labels/New%20model) filtern, um zu sehen, ob es noch unbeanspruchte Modellanfragen gibt, und daran arbeiten. + +Sobald Sie eine neue Modellanfrage eröffnet haben, sollten Sie sich zunächst mit 🤗 Transformers vertraut machen, falls Sie das noch nicht sind! + +## Allgemeiner Überblick über 🤗 Transformers + +Zunächst sollten Sie sich einen allgemeinen Überblick über 🤗 Transformers verschaffen. 🤗 Transformers ist eine sehr meinungsfreudige Bibliothek, es ist also möglich, dass +Es besteht also die Möglichkeit, dass Sie mit einigen der Philosophien oder Designentscheidungen der Bibliothek nicht einverstanden sind. Aus unserer Erfahrung heraus haben wir jedoch +dass die grundlegenden Designentscheidungen und Philosophien der Bibliothek entscheidend sind, um 🤗 Transformers effizient zu skalieren. +Transformatoren zu skalieren und gleichzeitig die Wartungskosten auf einem vernünftigen Niveau zu halten. + +Ein guter erster Ansatzpunkt, um die Bibliothek besser zu verstehen, ist die Lektüre der [Dokumentation unserer Philosophie](Philosophie). 
Als Ergebnis unserer Arbeitsweise gibt es einige Entscheidungen, die wir versuchen, auf alle Modelle anzuwenden: + +- Komposition wird im Allgemeinen gegenüber Abstraktion bevorzugt +- Die Duplizierung von Code ist nicht immer schlecht, wenn sie die Lesbarkeit oder Zugänglichkeit eines Modells stark verbessert +- Modelldateien sind so in sich geschlossen wie möglich, so dass Sie, wenn Sie den Code eines bestimmten Modells lesen, idealerweise nur + in die entsprechende Datei `modeling_....py` schauen müssen. + +Unserer Meinung nach ist der Code der Bibliothek nicht nur ein Mittel, um ein Produkt bereitzustellen, *z.B.* die Möglichkeit, BERT für +Inferenz zu verwenden, sondern auch als das Produkt selbst, das wir verbessern wollen. Wenn Sie also ein Modell hinzufügen, ist der Benutzer nicht nur die +Person, die Ihr Modell verwenden wird, sondern auch jeder, der Ihren Code liest, zu verstehen versucht und ihn möglicherweise verbessert. + +Lassen Sie uns daher ein wenig tiefer in das allgemeine Design der Bibliothek einsteigen. + +### Überblick über die Modelle + +Um ein Modell erfolgreich hinzuzufügen, ist es wichtig, die Interaktion zwischen Ihrem Modell und seiner Konfiguration zu verstehen, +[`PreTrainedModel`] und [`PretrainedConfig`]. Als Beispiel werden wir +das Modell, das zu 🤗 Transformers hinzugefügt werden soll, `BrandNewBert` nennen. + +Schauen wir uns das mal an: + + + +Wie Sie sehen, machen wir in 🤗 Transformers von der Vererbung Gebrauch, aber wir beschränken die Abstraktionsebene auf ein absolutes Minimum. +Minimum. Es gibt nie mehr als zwei Abstraktionsebenen für ein Modell in der Bibliothek. `BrandNewBertModel` +erbt von `BrandNewBertPreTrainedModel`, das wiederum von [`PreTrainedModel`] erbt und +das war's. In der Regel wollen wir sicherstellen, dass ein neues Modell nur von +[`PreTrainedModel`] abhängt. Die wichtigen Funktionalitäten, die jedem neuen Modell automatisch zur Verfügung gestellt werden, sind +Modell automatisch bereitgestellt werden, sind [`~PreTrainedModel.from_pretrained`] und +[`~PreTrainedModel.save_pretrained`], die für die Serialisierung und Deserialisierung verwendet werden. Alle +anderen wichtigen Funktionalitäten, wie `BrandNewBertModel.forward` sollten vollständig in der neuen +Skript `modeling_brand_new_bert.py` definiert werden. Als nächstes wollen wir sicherstellen, dass ein Modell mit einer bestimmten Kopfebene, wie z.B. +`BrandNewBertForMaskedLM` nicht von `BrandNewBertModel` erbt, sondern `BrandNewBertModel` verwendet +als Komponente, die im Forward Pass aufgerufen werden kann, um die Abstraktionsebene niedrig zu halten. Jedes neue Modell erfordert eine +Konfigurationsklasse, genannt `BrandNewBertConfig`. Diese Konfiguration wird immer als ein Attribut in +[PreTrainedModel] gespeichert und kann daher über das Attribut `config` für alle Klassen aufgerufen werden +die von `BrandNewBertPreTrainedModel` erben: + +```python +model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert") +model.config # model has access to its config +``` + +Ähnlich wie das Modell erbt die Konfiguration grundlegende Serialisierungs- und Deserialisierungsfunktionalitäten von +[`PretrainedConfig`]. Beachten Sie, dass die Konfiguration und das Modell immer in zwei verschiedene Formate serialisiert werden +unterschiedliche Formate serialisiert werden - das Modell in eine *pytorch_model.bin* Datei und die Konfiguration in eine *config.json* Datei. 
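+
+Zur Veranschaulichung eine minimale Skizze (der Checkpoint-Name ist derselbe Platzhalter wie im Beispiel oben, der Zielpfad ist frei gewählt):
+
+```python
+# Minimale Skizze: Beim Speichern landen Gewichte und Konfiguration nebeneinander
+# in einem Verzeichnis.
+model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert")
+model.save_pretrained("./brand_new_bert")
+# ./brand_new_bert/pytorch_model.bin  <- Modellgewichte
+# ./brand_new_bert/config.json        <- Konfiguration
+```
+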
Aufruf von +[~PreTrainedModel.save_pretrained`] wird automatisch +[~PretrainedConfig.save_pretrained`] auf, so dass sowohl das Modell als auch die Konfiguration gespeichert werden. + + +### Code-Stil + +Wenn Sie Ihr neues Modell kodieren, sollten Sie daran denken, dass Transformers eine Bibliothek mit vielen Meinungen ist und dass wir selbst ein paar Macken haben +wie der Code geschrieben werden sollte :-) + +1. Der Vorwärtsdurchlauf Ihres Modells sollte vollständig in die Modellierungsdatei geschrieben werden und dabei völlig unabhängig von anderen + Modellen in der Bibliothek. Wenn Sie einen Block aus einem anderen Modell wiederverwenden möchten, kopieren Sie den Code und fügen ihn mit einem + `# Kopiert von` ein (siehe [hier](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160) + für ein gutes Beispiel und [hier](pr_checks#check-copies) für weitere Dokumentation zu Copied from). +2. Der Code sollte vollständig verständlich sein, auch für einen Nicht-Muttersprachler. Das heißt, Sie sollten + beschreibende Variablennamen wählen und Abkürzungen vermeiden. Ein Beispiel: `activation` ist `act` vorzuziehen. + Von Variablennamen mit nur einem Buchstaben wird dringend abgeraten, es sei denn, es handelt sich um einen Index in einer for-Schleife. +3. Generell ziehen wir längeren expliziten Code einem kurzen magischen Code vor. +4. Vermeiden Sie die Unterklassifizierung von `nn.Sequential` in PyTorch, sondern unterklassifizieren Sie `nn.Module` und schreiben Sie den Vorwärtspass, so dass jeder + so dass jeder, der Ihren Code verwendet, ihn schnell debuggen kann, indem er Druckanweisungen oder Haltepunkte hinzufügt. +5. Ihre Funktionssignatur sollte mit einer Typ-Annotation versehen sein. Im Übrigen sind gute Variablennamen viel lesbarer und verständlicher + verständlicher als Typ-Anmerkungen. + +### Übersicht der Tokenizer + +Noch nicht ganz fertig :-( Dieser Abschnitt wird bald hinzugefügt! + +## Schritt-für-Schritt-Rezept zum Hinzufügen eines Modells zu 🤗 Transformers + +Jeder hat andere Vorlieben, was die Portierung eines Modells angeht. Daher kann es sehr hilfreich sein, wenn Sie sich Zusammenfassungen ansehen +wie andere Mitwirkende Modelle auf Hugging Face portiert haben. Hier ist eine Liste von Blogbeiträgen aus der Community, wie man ein Modell portiert: + +1. [Portierung eines GPT2-Modells](https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28) von [Thomas](https://huggingface.co/thomwolf) +2. [Portierung des WMT19 MT-Modells](https://huggingface.co/blog/porting-fsmt) von [Stas](https://huggingface.co/stas) + +Aus Erfahrung können wir Ihnen sagen, dass die wichtigsten Dinge, die Sie beim Hinzufügen eines Modells beachten müssen, sind: + +- Erfinden Sie das Rad nicht neu! Die meisten Teile des Codes, den Sie für das neue 🤗 Transformers-Modell hinzufügen werden, existieren bereits + irgendwo in 🤗 Transformers. Nehmen Sie sich etwas Zeit, um ähnliche, bereits vorhandene Modelle und Tokenizer zu finden, die Sie kopieren können + von. [grep](https://www.gnu.org/software/grep/) und [rg](https://github.com/BurntSushi/ripgrep) sind Ihre + Freunde. Beachten Sie, dass es sehr gut möglich ist, dass der Tokenizer Ihres Modells auf einer Modellimplementierung basiert und + und der Modellierungscode Ihres Modells auf einer anderen. *Z.B.* Der Modellierungscode von FSMT basiert auf BART, während der Tokenizer-Code von FSMT + auf XLM basiert. 
+- Es handelt sich eher um eine technische als um eine wissenschaftliche Herausforderung. Sie sollten mehr Zeit darauf verwenden, eine effiziente Debugging-Umgebung zu schaffen, als zu versuchen, alle theoretischen Aspekte des Modells im Papier zu verstehen.
+- Bitten Sie um Hilfe, wenn Sie nicht weiterkommen! Modelle sind der Kernbestandteil von 🤗 Transformers, daher helfen wir Ihnen bei Hugging Face mehr als gerne bei jedem Schritt, Ihr Modell hinzuzufügen. Zögern Sie nicht zu fragen, wenn Sie merken, dass Sie keine Fortschritte machen.
+
+Im Folgenden versuchen wir, Ihnen ein allgemeines Rezept an die Hand zu geben, das uns bei der Portierung eines Modells auf 🤗 Transformers am nützlichsten erschien.
+
+Die folgende Liste ist eine Zusammenfassung all dessen, was getan werden muss, um ein Modell hinzuzufügen, und kann von Ihnen als To-Do-Liste verwendet werden:
+
+☐ (Optional) Verstehen der theoretischen Aspekte des Modells
+☐ Vorbereiten der 🤗 Transformers-Entwicklungsumgebung
+☐ Debugging-Umgebung des ursprünglichen Repositorys eingerichtet
+☐ Skript erstellt, das den Durchlauf `forward()` unter Verwendung des ursprünglichen Repositorys und des Checkpoints erfolgreich durchführt
+☐ Erfolgreich das Modellskelett zu 🤗 Transformers hinzugefügt
+☐ Erfolgreiche Umwandlung des ursprünglichen Prüfpunkts in den 🤗 Transformers-Prüfpunkt
+☐ Erfolgreich den Durchlauf `forward()` in 🤗 Transformers ausgeführt, der eine identische Ausgabe wie der ursprüngliche Prüfpunkt liefert
+☐ Modell-Tests in 🤗 Transformers abgeschlossen
+☐ Erfolgreich Tokenizer in 🤗 Transformers hinzugefügt
+☐ End-to-End-Integrationstests ausgeführt
+☐ Docs fertiggestellt
+☐ Modellgewichte in den Hub hochgeladen
+☐ Die Pull-Anfrage eingereicht
+☐ (Optional) Hinzufügen eines Demo-Notizbuchs + +Für den Anfang empfehlen wir in der Regel, mit einem guten theoretischen Verständnis von `BrandNewBert` zu beginnen. Wie auch immer, +wenn Sie es vorziehen, die theoretischen Aspekte des Modells *on-the-job* zu verstehen, dann ist es völlig in Ordnung, direkt in die +in die Code-Basis von `BrandNewBert` einzutauchen. Diese Option könnte für Sie besser geeignet sein, wenn Ihre technischen Fähigkeiten besser sind als +als Ihre theoretischen Fähigkeiten, wenn Sie Schwierigkeiten haben, die Arbeit von `BrandNewBert` zu verstehen, oder wenn Sie einfach Spaß am Programmieren +mehr Spaß am Programmieren haben als am Lesen wissenschaftlicher Abhandlungen. + +### 1. (Optional) Theoretische Aspekte von BrandNewBert + +Sie sollten sich etwas Zeit nehmen, um die Abhandlung von *BrandNewBert* zu lesen, falls eine solche Beschreibung existiert. Möglicherweise gibt es große +Abschnitte des Papiers, die schwer zu verstehen sind. Wenn das der Fall ist, ist das in Ordnung - machen Sie sich keine Sorgen! Das Ziel ist +ist es nicht, ein tiefes theoretisches Verständnis des Papiers zu erlangen, sondern die notwendigen Informationen zu extrahieren, um +das Modell effektiv in 🤗 Transformers zu implementieren. Das heißt, Sie müssen nicht zu viel Zeit auf die +theoretischen Aspekten verbringen, sondern sich lieber auf die praktischen Aspekte konzentrieren, nämlich: + +- Welche Art von Modell ist *brand_new_bert*? BERT-ähnliches Modell nur für den Encoder? GPT2-ähnliches reines Decoder-Modell? BART-ähnliches + Encoder-Decoder-Modell? Sehen Sie sich die [model_summary](model_summary) an, wenn Sie mit den Unterschieden zwischen diesen Modellen nicht vertraut sind. +- Was sind die Anwendungen von *brand_new_bert*? Textklassifizierung? Texterzeugung? Seq2Seq-Aufgaben, *z.B.,* + Zusammenfassungen? +- Was ist die neue Eigenschaft des Modells, die es von BERT/GPT-2/BART unterscheidet? +- Welches der bereits existierenden [🤗 Transformers-Modelle](https://huggingface.co/transformers/#contents) ist am ähnlichsten + ähnlich wie *brand_new_bert*? +- Welche Art von Tokenizer wird verwendet? Ein Satzteil-Tokenisierer? Ein Wortstück-Tokenisierer? Ist es derselbe Tokenisierer, der für + für BERT oder BART? + +Nachdem Sie das Gefühl haben, einen guten Überblick über die Architektur des Modells erhalten zu haben, können Sie dem +Hugging Face Team schreiben und Ihre Fragen stellen. Dazu können Fragen zur Architektur des Modells gehören, +seiner Aufmerksamkeitsebene usw. Wir werden Ihnen gerne weiterhelfen. + +### 2. Bereiten Sie als nächstes Ihre Umgebung vor + +1. Forken Sie das [Repository](https://github.com/huggingface/transformers), indem Sie auf der Seite des Repositorys auf die Schaltfläche 'Fork' klicken. + Seite des Repositorys klicken. Dadurch wird eine Kopie des Codes unter Ihrem GitHub-Benutzerkonto erstellt. + +2. Klonen Sie Ihren `transformers` Fork auf Ihre lokale Festplatte und fügen Sie das Basis-Repository als Remote hinzu: + +```bash +git clone https://github.com/[your Github handle]/transformers.git +cd transformers +git remote add upstream https://github.com/huggingface/transformers.git +``` + +3. Richten Sie eine Entwicklungsumgebung ein, indem Sie z.B. den folgenden Befehl ausführen: + +```bash +python -m venv .env +source .env/bin/activate +pip install -e ".[dev]" +``` + +Abhängig von Ihrem Betriebssystem und da die Anzahl der optionalen Abhängigkeiten von Transformers wächst, kann es sein, dass Sie bei diesem Befehl einen +Fehler mit diesem Befehl. 
Stellen Sie in diesem Fall sicher, dass Sie das Deep Learning Framework, mit dem Sie arbeiten, installieren +(PyTorch, TensorFlow und/oder Flax) und führen Sie es aus: + +```bash +pip install -e ".[quality]" +``` + +was für die meisten Anwendungsfälle ausreichend sein sollte. Sie können dann zum übergeordneten Verzeichnis zurückkehren + +```bash +cd .. +``` + +4. Wir empfehlen, die PyTorch-Version von *brand_new_bert* zu Transformers hinzuzufügen. Um PyTorch zu installieren, folgen Sie bitte den + Anweisungen auf https://pytorch.org/get-started/locally/. + +**Anmerkung:** Sie müssen CUDA nicht installiert haben. Es reicht aus, das neue Modell auf der CPU zum Laufen zu bringen. + +5. Um *brand_new_bert* zu portieren, benötigen Sie außerdem Zugriff auf das Original-Repository: + +```bash +git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git +cd brand_new_bert +pip install -e . +``` + +Jetzt haben Sie eine Entwicklungsumgebung eingerichtet, um *brand_new_bert* auf 🤗 Transformers zu portieren. + +### 3.-4. Führen Sie einen Pre-Training-Checkpoint mit dem Original-Repository durch + +Zunächst werden Sie mit dem ursprünglichen *brand_new_bert* Repository arbeiten. Oft ist die ursprüngliche Implementierung sehr +"forschungslastig". Das bedeutet, dass es an Dokumentation mangeln kann und der Code schwer zu verstehen sein kann. Aber das sollte +genau Ihre Motivation sein, *brand_new_bert* neu zu implementieren. Eines unserer Hauptziele bei Hugging Face ist es, *die Menschen dazu zu bringen +auf den Schultern von Giganten zu stehen*, was sich hier sehr gut darin ausdrückt, dass wir ein funktionierendes Modell nehmen und es umschreiben, um es so +es so **zugänglich, benutzerfreundlich und schön** wie möglich zu machen. Dies ist die wichtigste Motivation für die Neuimplementierung von +Modelle in 🤗 Transformers umzuwandeln - der Versuch, komplexe neue NLP-Technologie für **jeden** zugänglich zu machen. + +Sie sollten damit beginnen, indem Sie in das Original-Repository eintauchen. + +Die erfolgreiche Ausführung des offiziellen Pre-Trainingsmodells im Original-Repository ist oft **der schwierigste** Schritt. +Unserer Erfahrung nach ist es sehr wichtig, dass Sie einige Zeit damit verbringen, sich mit der ursprünglichen Code-Basis vertraut zu machen. Sie müssen +das Folgende herausfinden: + +- Wo finden Sie die vortrainierten Gewichte? +- Wie lädt man die vorab trainierten Gewichte in das entsprechende Modell? +- Wie kann der Tokenizer unabhängig vom Modell ausgeführt werden? +- Verfolgen Sie einen Forward Pass, damit Sie wissen, welche Klassen und Funktionen für einen einfachen Forward Pass erforderlich sind. Normalerweise, + müssen Sie nur diese Funktionen reimplementieren. +- Sie müssen in der Lage sein, die wichtigen Komponenten des Modells zu finden: Wo befindet sich die Klasse des Modells? Gibt es Unterklassen des Modells, + *z.B.* EncoderModel, DecoderModel? Wo befindet sich die Selbstaufmerksamkeitsschicht? Gibt es mehrere verschiedene Aufmerksamkeitsebenen, + *z.B.* *Selbstaufmerksamkeit*, *Kreuzaufmerksamkeit*...? +- Wie können Sie das Modell in der ursprünglichen Umgebung des Repo debuggen? Müssen Sie *print* Anweisungen hinzufügen, können Sie + mit einem interaktiven Debugger wie *ipdb* arbeiten oder sollten Sie eine effiziente IDE zum Debuggen des Modells verwenden, wie z.B. PyCharm? + +Es ist sehr wichtig, dass Sie, bevor Sie mit der Portierung beginnen, den Code im Original-Repository **effizient** debuggen können +Repository können! 
Denken Sie auch daran, dass Sie mit einer Open-Source-Bibliothek arbeiten, also zögern Sie nicht, ein Problem oder +oder sogar eine Pull-Anfrage im Original-Repository zu stellen. Die Betreuer dieses Repositorys sind wahrscheinlich sehr froh darüber +dass jemand in ihren Code schaut! + +An diesem Punkt liegt es wirklich an Ihnen, welche Debugging-Umgebung und Strategie Sie zum Debuggen des ursprünglichen +Modell zu debuggen. Wir raten dringend davon ab, eine kostspielige GPU-Umgebung einzurichten, sondern arbeiten Sie einfach auf einer CPU, sowohl wenn Sie mit dem +in das ursprüngliche Repository einzutauchen und auch, wenn Sie beginnen, die 🤗 Transformers-Implementierung des Modells zu schreiben. Nur +ganz am Ende, wenn das Modell bereits erfolgreich auf 🤗 Transformers portiert wurde, sollte man überprüfen, ob das +Modell auch auf der GPU wie erwartet funktioniert. + +Im Allgemeinen gibt es zwei mögliche Debugging-Umgebungen für die Ausführung des Originalmodells + +- [Jupyter notebooks](https://jupyter.org/) / [google colab](https://colab.research.google.com/notebooks/intro.ipynb) +- Lokale Python-Skripte. + +Jupyter-Notebooks haben den Vorteil, dass sie eine zellenweise Ausführung ermöglichen, was hilfreich sein kann, um logische Komponenten besser voneinander zu trennen und +logische Komponenten voneinander zu trennen und schnellere Debugging-Zyklen zu haben, da Zwischenergebnisse gespeichert werden können. Außerdem, +Außerdem lassen sich Notebooks oft leichter mit anderen Mitwirkenden teilen, was sehr hilfreich sein kann, wenn Sie das Hugging Face Team um Hilfe bitten möchten. +Face Team um Hilfe bitten. Wenn Sie mit Jupyter-Notizbüchern vertraut sind, empfehlen wir Ihnen dringend, mit ihnen zu arbeiten. + +Der offensichtliche Nachteil von Jupyter-Notizbüchern ist, dass Sie, wenn Sie nicht daran gewöhnt sind, mit ihnen zu arbeiten, einige Zeit damit verbringen müssen +einige Zeit damit verbringen müssen, sich an die neue Programmierumgebung zu gewöhnen, und dass Sie möglicherweise Ihre bekannten Debugging-Tools nicht mehr verwenden können +wie z.B. `ipdb` nicht mehr verwenden können. + +Für jede Codebasis ist es immer ein guter erster Schritt, einen **kleinen** vortrainierten Checkpoint zu laden und in der Lage zu sein, einen +einzelnen Vorwärtsdurchlauf mit einem Dummy-Integer-Vektor von Eingabe-IDs als Eingabe zu reproduzieren. Ein solches Skript könnte wie folgt aussehen (in +Pseudocode): + +```python +model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") +input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids +original_output = model.predict(input_ids) +``` + +Was die Debugging-Strategie anbelangt, so können Sie im Allgemeinen aus mehreren Strategien wählen: + +- Zerlegen Sie das ursprüngliche Modell in viele kleine testbare Komponenten und führen Sie für jede dieser Komponenten einen Vorwärtsdurchlauf zur + Überprüfung +- Zerlegen Sie das ursprüngliche Modell nur in den ursprünglichen *Tokenizer* und das ursprüngliche *Modell*, führen Sie einen Vorwärtsdurchlauf für diese Komponenten durch + und verwenden Sie dazwischenliegende Druckanweisungen oder Haltepunkte zur Überprüfung. + +Auch hier bleibt es Ihnen überlassen, welche Strategie Sie wählen. Oft ist die eine oder die andere Strategie vorteilhaft, je nach der ursprünglichen Codebasis +Basis. 
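+
+Wenn das Originalmodell in PyTorch geschrieben ist, lassen sich solche Zwischenausgaben zum Beispiel mit Forward-Hooks abgreifen, ohne den Originalcode zu verändern. Die folgende Skizze ist nur eine Annahme: `model` und der Pfad `model.encoder.layers[0]` zur ersten Transformer-Schicht sind Platzhalter für das jeweilige Originalmodell.
+
+```python
+# Skizze (nur zur Veranschaulichung): Zwischenausgabe einer Schicht per Forward-Hook
+# mitschreiben, um sie später mit der 🤗 Transformers-Version zu vergleichen.
+import torch
+
+captured = {}
+
+def save_output(name):
+    def hook(module, inputs, output):
+        captured[name] = output
+    return hook
+
+# Platzhalter: erste Transformer-Schicht des Originalmodells
+handle = model.encoder.layers[0].register_forward_hook(save_output("layer_0"))
+
+with torch.no_grad():
+    original_output = model(torch.tensor([[0, 4, 5, 2, 3, 7, 9]]))
+
+handle.remove()
+print(captured["layer_0"])  # Zwischenwert zur Überprüfung
+```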
+ +Wenn die ursprüngliche Codebasis es Ihnen erlaubt, das Modell in kleinere Teilkomponenten zu zerlegen, *z.B.* wenn die ursprüngliche +Code-Basis problemlos im Eager-Modus ausgeführt werden kann, lohnt es sich in der Regel, dies zu tun. Es gibt einige wichtige Vorteile +am Anfang den schwierigeren Weg zu gehen: + +- Wenn Sie später das ursprüngliche Modell mit der Hugging Face-Implementierung vergleichen, können Sie automatisch überprüfen, ob + für jede Komponente einzeln überprüfen, ob die entsprechende Komponente der 🤗 Transformers-Implementierung übereinstimmt, anstatt sich auf + anstatt sich auf den visuellen Vergleich über Druckanweisungen zu verlassen +- können Sie das große Problem der Portierung eines Modells in kleinere Probleme der Portierung einzelner Komponenten zerlegen + einzelnen Komponenten zu zerlegen und so Ihre Arbeit besser zu strukturieren +- Die Aufteilung des Modells in logisch sinnvolle Komponenten hilft Ihnen, einen besseren Überblick über das Design des Modells zu bekommen + und somit das Modell besser zu verstehen +- In einem späteren Stadium helfen Ihnen diese komponentenweisen Tests dabei, sicherzustellen, dass keine Regressionen auftreten, während Sie fortfahren + Ihren Code ändern + +[Lysandre's](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed) Integrationstests für ELECTRA +gibt ein schönes Beispiel dafür, wie dies geschehen kann. + +Wenn die ursprüngliche Codebasis jedoch sehr komplex ist oder nur die Ausführung von Zwischenkomponenten in einem kompilierten Modus erlaubt, +könnte es zu zeitaufwändig oder sogar unmöglich sein, das Modell in kleinere testbare Teilkomponenten zu zerlegen. Ein gutes +Beispiel ist die [T5's MeshTensorFlow](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow) Bibliothek, die sehr komplex ist +sehr komplex ist und keine einfache Möglichkeit bietet, das Modell in seine Unterkomponenten zu zerlegen. Bei solchen Bibliotheken ist man +oft auf die Überprüfung von Druckanweisungen angewiesen. + +Unabhängig davon, welche Strategie Sie wählen, ist die empfohlene Vorgehensweise oft die gleiche, nämlich dass Sie mit der Fehlersuche in den +die Anfangsebenen zuerst und die Endebenen zuletzt debuggen. + +Es wird empfohlen, dass Sie die Ausgaben der folgenden Ebenen abrufen, entweder durch Druckanweisungen oder Unterkomponentenfunktionen +Schichten in der folgenden Reihenfolge abrufen: + +1. Rufen Sie die Eingabe-IDs ab, die an das Modell übergeben wurden +2. Rufen Sie die Worteinbettungen ab +3. Rufen Sie die Eingabe der ersten Transformer-Schicht ab +4. Rufen Sie die Ausgabe der ersten Transformer-Schicht ab +5. Rufen Sie die Ausgabe der folgenden n - 1 Transformer-Schichten ab +6. 
Rufen Sie die Ausgabe des gesamten BrandNewBert-Modells ab
+
+Die Eingabe-IDs sollten dabei aus einem Array von Ganzzahlen bestehen, *z.B.* `input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]`
+
+Die Ausgaben der folgenden Schichten bestehen oft aus mehrdimensionalen Float-Arrays und können wie folgt aussehen:
+
+```
+[[
+ [-0.1465, -0.6501,  0.1993,  ...,  0.1451,  0.3430,  0.6024],
+ [-0.4417, -0.5920,  0.3450,  ..., -0.3062,  0.6182,  0.7132],
+ [-0.5009, -0.7122,  0.4548,  ..., -0.3662,  0.6091,  0.7648],
+ ...,
+ [-0.5613, -0.6332,  0.4324,  ..., -0.3792,  0.7372,  0.9288],
+ [-0.5416, -0.6345,  0.4180,  ..., -0.3564,  0.6992,  0.9191],
+ [-0.5334, -0.6403,  0.4271,  ..., -0.3339,  0.6533,  0.8694]]],
+```
+
+Wir erwarten, dass jedes zu 🤗 Transformers hinzugefügte Modell eine Reihe von Integrationstests besteht, d.h. das ursprüngliche
+Modell und die neu implementierte Version in 🤗 Transformers müssen bis auf eine Genauigkeit von 0,001 exakt dieselbe Ausgabe liefern!
+Da es normal ist, dass das exakt gleiche Modell, in verschiedenen Bibliotheken geschrieben, je nach Framework eine leicht
+unterschiedliche Ausgabe liefern kann, akzeptieren wir eine Fehlertoleranz von 1e-3 (0,001). Es reicht nicht aus, wenn das Modell
+fast das gleiche Ergebnis liefert, die Ausgaben müssen nahezu identisch sein. Daher werden Sie die Zwischenergebnisse der
+🤗 Transformers-Version sicherlich mehrfach mit den Zwischenergebnissen der ursprünglichen Implementierung von
+*brand_new_bert* vergleichen. Dafür ist eine **effiziente** Debugging-Umgebung des ursprünglichen Repositorys absolut
+wichtig. Hier sind einige Ratschläge, um Ihre Debugging-Umgebung so effizient wie möglich zu gestalten.
+
+- Finden Sie den besten Weg, um Zwischenergebnisse zu debuggen. Ist das ursprüngliche Repository in PyTorch geschrieben?
+  Dann sollten Sie sich wahrscheinlich die Zeit nehmen, ein längeres Skript zu schreiben, das das ursprüngliche Modell in kleinere Unterkomponenten zerlegt, um
+  Zwischenwerte abzurufen. Ist das ursprüngliche Repository in TensorFlow 1 geschrieben? Dann müssen Sie sich möglicherweise auf
+  TensorFlow-Print-Operationen wie [tf.print](https://www.tensorflow.org/api_docs/python/tf/print) verlassen, um die
+  Zwischenwerte auszugeben. Ist das ursprüngliche Repository in Jax geschrieben? Dann stellen Sie sicher, dass das Modell **nicht jitted** ist,
+  wenn Sie den Vorwärtsdurchlauf ausführen, schauen Sie sich *z.B.* [diesen Link](https://github.com/google/jax/issues/196) an.
+- Verwenden Sie den kleinsten vortrainierten Checkpoint, den Sie finden können. Je kleiner der Checkpoint, desto schneller wird Ihr Debugging-Zyklus.
+  Es ist nicht effizient, wenn Ihr vortrainiertes Modell so groß ist, dass Ihr Vorwärtsdurchlauf mehr als 10 Sekunden dauert.
+  Falls nur sehr große Checkpoints verfügbar sind, kann es sinnvoller sein, ein Dummy-Modell mit zufällig initialisierten Gewichten in der neuen
+  Umgebung zu erstellen und diese Gewichte zu speichern, um sie mit der 🤗 Transformers-Version Ihres Modells zu vergleichen.
+- Vergewissern Sie sich, dass Sie den einfachsten Weg wählen, um einen Forward Pass im ursprünglichen Repository aufzurufen. Idealerweise sollten Sie
+  die Funktion im Original-Repository finden, die **nur** einen einzigen Forward Pass aufruft, *d.h.* eine Funktion, die oft
+  `predict`, `evaluate`, `forward` oder `__call__` genannt wird.
+  Sie wollen keine Funktion debuggen, die `forward`
+  mehrfach aufruft, *z.B.* um Text zu erzeugen, wie `autoregressive_sample` oder `generate`.
+- Versuchen Sie, die Tokenisierung vom *Forward*-Pass des Modells zu trennen. Wenn das Original-Repository Beispiele zeigt, bei denen
+  Sie eine Zeichenkette eingeben müssen, dann versuchen Sie herauszufinden, an welcher Stelle im Vorwärtsaufruf die Zeichenketteneingabe in Eingabe-IDs
+  umgewandelt wird, und beginnen Sie an dieser Stelle. Das könnte bedeuten, dass Sie möglicherweise selbst ein kleines Skript schreiben oder den
+  Originalcode so ändern müssen, dass Sie die IDs direkt eingeben können, anstatt eine Zeichenkette einzugeben.
+- Vergewissern Sie sich, dass sich das Modell in Ihrem Debugging-Setup **nicht** im Trainingsmodus befindet. Dieser führt häufig zu
+  zufälligen Ergebnissen, da das Modell mehrere Dropout-Schichten enthält. Stellen Sie sicher, dass der Vorwärtsdurchlauf in Ihrer Debugging-
+  Umgebung **deterministisch** ist, damit die Dropout-Schichten nicht verwendet werden. Oder verwenden Sie *transformers.utils.set_seed*,
+  wenn sich die alte und die neue Implementierung im selben Framework befinden.
+
+Im folgenden Abschnitt finden Sie genauere Details/Tipps, wie Sie dies für *brand_new_bert* tun können.
+
+### 5.-14. Portierung von BrandNewBert auf 🤗 Transformers
+
+Als Nächstes können Sie endlich damit beginnen, neuen Code zu 🤗 Transformers hinzuzufügen. Gehen Sie in den Klon Ihres 🤗 Transformers-Forks:
+
+```bash
+cd transformers
+```
+
+In dem speziellen Fall, dass Sie ein Modell hinzufügen, dessen Architektur genau mit der Modellarchitektur eines bereits
+vorhandenen Modells übereinstimmt, müssen Sie nur ein Konvertierungsskript hinzufügen, wie in [diesem Abschnitt](#write-a-conversion-script) beschrieben.
+In diesem Fall können Sie einfach die gesamte Modellarchitektur des bereits vorhandenen Modells wiederverwenden.
+
+Andernfalls beginnen wir mit der Erstellung eines neuen Modells. Sie haben hier zwei Möglichkeiten:
+
+- `transformers-cli add-new-model-like`, um ein neues Modell wie ein bestehendes hinzuzufügen
+- `transformers-cli add-new-model`, um ein neues Modell aus unserer Vorlage hinzuzufügen (sieht dann aus wie BERT oder BART, je nachdem, welche Art von Modell Sie wählen)
+
+In beiden Fällen werden Sie mit einem Fragebogen aufgefordert, die grundlegenden Informationen zu Ihrem Modell auszufüllen. Für den zweiten Befehl müssen Sie `cookiecutter` installieren, weitere Informationen dazu finden Sie [hier](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model).
+
+**Eröffnen Sie einen Pull Request auf dem Haupt-Repository huggingface/transformers**
+
+Bevor Sie mit der Anpassung des automatisch generierten Codes beginnen, ist es nun an der Zeit, einen "Work in progress (WIP)"-Pull-Request,
+*z.B.* "[WIP] Add *brand_new_bert*", in 🤗 Transformers zu eröffnen, damit Sie und das Hugging Face Team
+Seite an Seite an der Integration des Modells in 🤗 Transformers arbeiten können.
+
+Sie sollten Folgendes tun:
+
+1. Erstellen Sie einen Branch mit einem beschreibenden Namen, ausgehend von Ihrem main-Branch:
+
+```bash
+git checkout -b add_brand_new_bert
+```
+
+2. Committen Sie den automatisch generierten Code:
+
+```bash
+git add .
+git commit
+```
+
+3. Holen Sie den aktuellen main-Branch und rebasen Sie Ihren Branch darauf:
+
+```bash
+git fetch upstream
+git rebase upstream/main
+```
+
+4.
Übertragen Sie die Änderungen auf Ihr Konto mit:
+
+```bash
+git push -u origin add_brand_new_bert
+```
+
+5. Wenn Sie zufrieden sind, gehen Sie auf die Webseite Ihres Forks auf GitHub. Klicken Sie auf "Pull request". Stellen Sie sicher, dass Sie die
+   GitHub-Handles einiger Mitglieder des Hugging Face-Teams als Reviewer hinzufügen, damit das Hugging Face-Team über
+   zukünftige Änderungen benachrichtigt wird.
+
+6. Ändern Sie den PR in einen Entwurf, indem Sie auf der rechten Seite der GitHub-Pull-Request-Webseite auf "In Entwurf umwandeln" klicken.
+
+Vergessen Sie im Folgenden nicht, Ihre Arbeit zu committen und in Ihr Konto zu pushen, wenn Sie Fortschritte gemacht haben,
+damit sie im Pull Request angezeigt wird. Außerdem sollten Sie darauf achten, Ihre Arbeit von Zeit zu Zeit mit dem aktuellen main-Branch
+zu aktualisieren, indem Sie Folgendes ausführen:
+
+```bash
+git fetch upstream
+git merge upstream/main
+```
+
+Generell sollten Sie alle Fragen, die Sie in Bezug auf das Modell oder Ihre Implementierung haben, in Ihrem PR stellen und
+dort diskutieren/lösen. Auf diese Weise wird das Hugging Face Team immer benachrichtigt, wenn Sie neuen Code einreichen oder
+wenn Sie eine Frage haben. Es ist oft sehr hilfreich, das Hugging Face-Team auf Ihren hinzugefügten Code hinzuweisen, damit es Ihr
+Problem oder Ihre Frage besser verstehen kann.
+
+Gehen Sie dazu auf die Registerkarte "Geänderte Dateien", auf der Sie alle Ihre Änderungen sehen, gehen Sie zu der Zeile, zu der Sie
+eine Frage stellen möchten, und klicken Sie auf das "+"-Symbol, um einen Kommentar hinzuzufügen. Wenn eine Frage oder ein Problem gelöst wurde,
+können Sie auf die Schaltfläche "Lösen" des erstellten Kommentars klicken.
+
+Auf dieselbe Weise wird das Hugging Face-Team Kommentare eröffnen, wenn es Ihren Code überprüft. Wir empfehlen, die meisten Fragen
+auf GitHub in Ihrem PR zu stellen. Einige sehr allgemeine Fragen, die für die Öffentlichkeit nicht sehr nützlich sind, können Sie dem
+Hugging Face Team per Slack oder E-Mail stellen.
+
+**5. Passen Sie den Code der generierten Modelle für brand_new_bert an**
+
+Zunächst werden wir uns nur auf das Modell selbst konzentrieren und uns nicht um den Tokenizer kümmern. Den gesamten relevanten Code sollten Sie
+in den generierten Dateien `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` und
+`src/transformers/models/brand_new_bert/configuration_brand_new_bert.py` finden.
+
+Jetzt können Sie endlich mit dem Programmieren beginnen :). Der generierte Code in
+`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` wird entweder die gleiche Architektur wie BERT haben, wenn
+es sich um ein reines Encoder-Modell handelt, oder wie BART, wenn es sich um ein Encoder-Decoder-Modell handelt. An diesem Punkt sollten Sie sich daran erinnern,
+was Sie am Anfang über die theoretischen Aspekte des Modells gelernt haben: *Wie unterscheidet sich das Modell von BERT oder
+BART?* Implementieren Sie diese Änderungen, was oft bedeutet, dass Sie die *Selbstaufmerksamkeitsschicht*, die Reihenfolge der Normalisierungsschicht
+usw. ändern müssen. Auch hier ist es oft nützlich, sich die ähnliche Architektur bereits bestehender Modelle in Transformers anzusehen, um
+ein besseres Gefühl dafür zu bekommen, wie Ihr Modell implementiert werden sollte.
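+
+Ein typisches Beispiel für eine solche Abweichung ist die Reihenfolge der Normalisierungsschicht. Die folgende, rein illustrative
+Skizze (sie ist nicht Teil des generierten Codes) stellt eine Post-LayerNorm-Anordnung, wie sie BERT verwendet, einer
+Pre-LayerNorm-Anordnung gegenüber, wie sie viele neuere Modelle verwenden:
+
+```python
+import torch
+from torch import nn
+
+
+class PostLayerNormBlock(nn.Module):
+    """BERT-artig: erst Residual-Addition, dann LayerNorm."""
+
+    def __init__(self, hidden_size=64, num_heads=4):
+        super().__init__()
+        self.attention = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)
+        self.layer_norm = nn.LayerNorm(hidden_size)
+
+    def forward(self, hidden_states):
+        attn_output, _ = self.attention(hidden_states, hidden_states, hidden_states)
+        return self.layer_norm(hidden_states + attn_output)
+
+
+class PreLayerNormBlock(nn.Module):
+    """Pre-LayerNorm: erst normalisieren, dann Attention und Residual-Addition."""
+
+    def __init__(self, hidden_size=64, num_heads=4):
+        super().__init__()
+        self.attention = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)
+        self.layer_norm = nn.LayerNorm(hidden_size)
+
+    def forward(self, hidden_states):
+        normed = self.layer_norm(hidden_states)
+        attn_output, _ = self.attention(normed, normed, normed)
+        return hidden_states + attn_output
+```
+
+Genau solche scheinbar kleinen Unterschiede sind die Stellen, an denen die Zwischenergebnisse später voneinander abweichen, wenn sie nicht korrekt übernommen werden.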
+
+**Beachten Sie**, dass Sie an diesem Punkt nicht sehr sicher sein müssen, dass Ihr Code völlig korrekt oder sauber ist. Vielmehr ist es
+ratsam, zunächst eine erste *unbereinigte*, kopierte Version des ursprünglichen Codes in
+`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` hinzuzufügen, bis Sie das Gefühl haben, dass der gesamte notwendige Code
+hinzugefügt wurde. Unserer Erfahrung nach ist es viel effizienter, schnell eine erste Version des erforderlichen Codes hinzuzufügen und
+den Code iterativ mit dem Konvertierungsskript zu verbessern/korrigieren, wie im nächsten Abschnitt beschrieben. Das Einzige, was
+zu diesem Zeitpunkt funktionieren muss, ist, dass Sie die 🤗 Transformers-Implementierung von *brand_new_bert* instanziieren können, *d.h.* der
+folgende Befehl sollte funktionieren:
+
+```python
+from transformers import BrandNewBertModel, BrandNewBertConfig
+
+model = BrandNewBertModel(BrandNewBertConfig())
+```
+
+Der obige Befehl erstellt ein Modell gemäß den Standardparametern, die in `BrandNewBertConfig()` definiert sind, mit
+zufälligen Gewichten und stellt damit sicher, dass die `init()`-Methoden aller Komponenten funktionieren.
+
+Beachten Sie, dass alle zufälligen Initialisierungen in der Methode `_init_weights` Ihrer `BrandnewBertPreTrainedModel`-Klasse
+erfolgen sollten. Sie sollte alle Blattmodule in Abhängigkeit von den Variablen der Konfiguration initialisieren. Hier ist ein Beispiel mit der
+`_init_weights`-Methode von BERT:
+
+```py
+def _init_weights(self, module):
+    """Initialize the weights"""
+    if isinstance(module, nn.Linear):
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        if module.bias is not None:
+            module.bias.data.zero_()
+    elif isinstance(module, nn.Embedding):
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        if module.padding_idx is not None:
+            module.weight.data[module.padding_idx].zero_()
+    elif isinstance(module, nn.LayerNorm):
+        module.bias.data.zero_()
+        module.weight.data.fill_(1.0)
+```
+
+Sie können weitere benutzerdefinierte Schemata verwenden, wenn Sie eine spezielle Initialisierung für einige Module benötigen. Zum Beispiel müssen in
+`Wav2Vec2ForPreTraining` die letzten beiden linearen Schichten die Initialisierung des regulären PyTorch `nn.Linear` behalten,
+aber alle anderen sollten eine Initialisierung wie oben verwenden. Dies ist wie folgt kodiert:
+
+```py
+def _init_weights(self, module):
+    """Initialize the weights"""
+    if isinstance(module, Wav2Vec2ForPreTraining):
+        module.project_hid.reset_parameters()
+        module.project_q.reset_parameters()
+        module.project_hid._is_hf_initialized = True
+        module.project_q._is_hf_initialized = True
+    elif isinstance(module, nn.Linear):
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        if module.bias is not None:
+            module.bias.data.zero_()
+```
+
+Das Flag `_is_hf_initialized` wird intern verwendet, um sicherzustellen, dass wir ein Submodul nur einmal initialisieren. Indem Sie es für
+`module.project_q` und `module.project_hid` auf `True` setzen, stellen Sie sicher, dass die benutzerdefinierte Initialisierung später nicht überschrieben wird,
+d.h. dass die Funktion `_init_weights` nicht erneut auf diese Module angewendet wird.
+
+**6. Schreiben Sie ein Konvertierungsskript**
+
+Als Nächstes sollten Sie ein Konvertierungsskript schreiben, mit dem Sie den Checkpoint, den Sie zum Debuggen von *brand_new_bert*
+im ursprünglichen Repository verwendet haben, in einen Checkpoint konvertieren können, der mit Ihrer gerade erstellten 🤗 Transformers-Implementierung von
+*brand_new_bert* kompatibel ist. Es ist nicht ratsam, das Konvertierungsskript von Grund auf neu zu schreiben; suchen Sie stattdessen in den bereits
+bestehenden Konvertierungsskripten in 🤗 Transformers nach einem Skript, das für die Konvertierung eines ähnlichen Modells verwendet wurde, das im
+selben Framework wie *brand_new_bert* geschrieben wurde. Normalerweise reicht es aus, ein bereits vorhandenes Konvertierungsskript zu kopieren und
+es für Ihren Anwendungsfall leicht anzupassen. Zögern Sie nicht, das Hugging Face Team zu bitten, Sie auf ein ähnliches, bereits vorhandenes
+Konvertierungsskript für Ihr Modell hinzuweisen.
+
+- Wenn Sie ein Modell von TensorFlow nach PyTorch portieren, ist ein guter Ausgangspunkt das Konvertierungsskript von BERT [hier](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91)
+- Wenn Sie ein Modell von PyTorch nach PyTorch portieren, ist ein guter Ausgangspunkt das Konvertierungsskript von BART [hier](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py)
+
+Im Folgenden werden wir kurz erklären, wie PyTorch-Modelle Ebenengewichte speichern und Ebenennamen definieren. In PyTorch wird der
+Name einer Ebene durch den Namen des Klassenattributs definiert, das Sie der Ebene geben. Definieren wir ein Dummy-Modell in
+PyTorch, das wir `SimpleModel` nennen:
+
+```python
+from torch import nn
+
+
+class SimpleModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.dense = nn.Linear(10, 10)
+        self.intermediate = nn.Linear(10, 10)
+        self.layer_norm = nn.LayerNorm(10)
+```
+
+Jetzt können wir eine Instanz dieser Modelldefinition erstellen, die alle Gewichte `dense`, `intermediate` und
+`layer_norm` mit zufälligen Werten füllt. Wir können das Modell ausdrucken, um seine Architektur zu sehen:
+
+```python
+model = SimpleModel()
+
+print(model)
+```
+
+Dies gibt Folgendes aus:
+
+```
+SimpleModel(
+  (dense): Linear(in_features=10, out_features=10, bias=True)
+  (intermediate): Linear(in_features=10, out_features=10, bias=True)
+  (layer_norm): LayerNorm((10,), eps=1e-05, elementwise_affine=True)
+)
+```
+
+Wir können sehen, dass die Ebenennamen durch den Namen des Klassenattributs in PyTorch definiert sind.
+Sie können die Gewichtswerte einer bestimmten Ebene anzeigen lassen:
+
+```python
+print(model.dense.weight.data)
+```
+
+um zu sehen, dass die Gewichte zufällig initialisiert wurden:
+
+```
+tensor([[-0.0818,  0.2207, -0.0749, -0.0030,  0.0045, -0.1569, -0.1598,  0.0212,
+         -0.2077,  0.2157],
+        [ 0.1044,  0.0201,  0.0990,  0.2482,  0.3116,  0.2509,  0.2866, -0.2190,
+          0.2166, -0.0212],
+        [-0.2000,  0.1107, -0.1999, -0.3119,  0.1559,  0.0993,  0.1776, -0.1950,
+         -0.1023, -0.0447],
+        [-0.0888, -0.1092,  0.2281,  0.0336,  0.1817, -0.0115,  0.2096,  0.1415,
+         -0.1876, -0.2467],
+        [ 0.2208, -0.2352, -0.1426, -0.2636, -0.2889, -0.2061, -0.2849, -0.0465,
+          0.2577,  0.0402],
+        [ 0.1502,  0.2465,  0.2566,  0.0693,  0.2352, -0.0530,  0.1859, -0.0604,
+          0.2132,  0.1680],
+        [ 0.1733, -0.2407, -0.1721,  0.1484,  0.0358, -0.0633, -0.0721, -0.0090,
+          0.2707, -0.2509],
+        [-0.1173,  0.1561,  0.2945,  0.0595, -0.1996,  0.2988, -0.0802,  0.0407,
+          0.1829, -0.1568],
+        [-0.1164, -0.2228, -0.0403,  0.0428,  0.1339,  0.0047,  0.1967,  0.2923,
+          0.0333, -0.0536],
+        [-0.1492, -0.1616,  0.1057,  0.1950, -0.2807, -0.2710, -0.1586,  0.0739,
+          0.2220,  0.2358]])
+```
+
+Im Konvertierungsskript sollten Sie diese zufällig initialisierten Gewichte mit den exakten Gewichten der
+entsprechenden Ebene im Checkpoint füllen. *Z.B.*:
+
+```python
+# retrieve matching layer weights, e.g. by
+# recursive algorithm
+layer_name = "dense"
+pretrained_weight = array_of_dense_layer
+
+model_pointer = getattr(model, "dense")
+
+model_pointer.weight.data = torch.from_numpy(pretrained_weight)
+```
+
+Dabei müssen Sie sicherstellen, dass jedes zufällig initialisierte Gewicht Ihres PyTorch-Modells und das entsprechende
+Checkpoint-Gewicht in **Form und Name** genau übereinstimmen. Zu diesem Zweck ist es **notwendig**, assert-Anweisungen
+für die Form hinzuzufügen und die Namen der Checkpoint-Gewichte auszugeben. Sie sollten z.B. Anweisungen hinzufügen wie:
+
+```python
+assert (
+    model_pointer.weight.shape == pretrained_weight.shape
+), f"Pointer shape of random weight {model_pointer.weight.shape} and array shape of checkpoint weight {pretrained_weight.shape} mismatched"
+```
+
+Außerdem sollten Sie die Namen der beiden Gewichte ausgeben, um sicherzustellen, dass sie übereinstimmen, *z.B.*:
+
+```python
+logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}")
+```
+
+Wenn entweder die Form oder der Name nicht übereinstimmt, haben Sie wahrscheinlich das falsche Checkpoint-Gewicht einer zufällig
+initialisierten Ebene der 🤗 Transformers-Implementierung zugewiesen.
+
+Eine falsche Form ist höchstwahrscheinlich auf eine falsche Einstellung der Konfigurationsparameter in `BrandNewBertConfig()` zurückzuführen, die
+nicht genau mit denen übereinstimmen, die für den zu konvertierenden Checkpoint verwendet wurden. Es könnte aber auch sein, dass
+die PyTorch-Implementierung eines Layers erfordert, dass das Gewicht vorher transponiert wird.
+
+Schließlich sollten Sie auch überprüfen, ob **alle** erforderlichen Gewichte initialisiert sind, und alle Checkpoint-Gewichte ausgeben,
+die nicht zur Initialisierung verwendet wurden, um sicherzustellen, dass das Modell korrekt konvertiert wurde. Es ist völlig normal, dass die
+Konvertierungsversuche zunächst entweder an einer falschen Shape-Assertion oder einer falschen Namenszuweisung scheitern. Das liegt höchstwahrscheinlich
+daran, dass Sie falsche Parameter in `BrandNewBertConfig()` verwendet haben, dass die 🤗 Transformers-Implementierung eine falsche Architektur hat,
+dass ein Fehler in den `init()`-Funktionen einer der Komponenten der 🤗 Transformers-Implementierung vorliegt oder dass Sie eines der
+Checkpoint-Gewichte transponieren müssen.
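+
+Zur Veranschaulichung: Der Kern eines solchen Konvertierungsskripts könnte wie in der folgenden, stark vereinfachten und rein
+hypothetischen Skizze aussehen. Sie geht davon aus, dass die Namen im Original-Checkpoint bereits eins zu eins den Parameternamen der
+🤗 Transformers-Implementierung entsprechen; in der Praxis müssen die Namen in der Regel explizit aufeinander abgebildet werden, und der
+Checkpoint-Pfad ist ein Platzhalter:
+
+```python
+import torch
+
+from transformers import BrandNewBertConfig, BrandNewBertModel
+
+# Annahme: der Original-Checkpoint ist ein dict von Parametername -> Tensor
+original_state_dict = torch.load("/path/to/original/checkpoint.bin", map_location="cpu")
+
+model = BrandNewBertModel(BrandNewBertConfig())
+
+unused_weights = set(original_state_dict)
+for name, param in model.named_parameters():
+    assert name in original_state_dict, f"Checkpoint is missing weight {name}"
+    pretrained_weight = original_state_dict[name]
+    assert (
+        param.shape == pretrained_weight.shape
+    ), f"Shape of {name} mismatched: {param.shape} vs. {pretrained_weight.shape}"
+    print(f"Initialize PyTorch weight {name}")
+    param.data = pretrained_weight
+    unused_weights.discard(name)
+
+print("Checkpoint weights that were not used:", sorted(unused_weights))
+```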
+
+Dieser Schritt sollte mit dem vorherigen Schritt wiederholt werden, bis alle Gewichte des Checkpoints korrekt in das
+Transformers-Modell geladen sind. Nachdem Sie den Checkpoint korrekt in die 🤗 Transformers-Implementierung geladen haben, können Sie
+das Modell in einem Ordner Ihrer Wahl `/path/to/converted/checkpoint/folder` speichern, der dann sowohl eine
+Datei `pytorch_model.bin` als auch eine Datei `config.json` enthalten sollte:
+
+```python
+model.save_pretrained("/path/to/converted/checkpoint/folder")
+```
+
+**7. Implementieren Sie den Vorwärtspass**
+
+Nachdem es Ihnen gelungen ist, die vortrainierten Gewichte korrekt in die 🤗 Transformers-Implementierung zu laden, sollten Sie nun
+sicherstellen, dass der Forward Pass korrekt implementiert ist. In [Machen Sie sich mit dem ursprünglichen Repository vertraut](#34-run-a-pretrained-checkpoint-using-the-original-repository) haben Sie bereits ein Skript erstellt, das einen Forward
+Pass des Modells unter Verwendung des Original-Repositorys durchführt. Jetzt sollten Sie ein analoges Skript schreiben, das die 🤗 Transformers-
+Implementierung anstelle der Originalimplementierung verwendet. Es sollte wie folgt aussehen:
+
+```python
+model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder")
+input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]
+output = model(input_ids).last_hidden_state
+```
+
+Es ist sehr wahrscheinlich, dass die 🤗 Transformers-Implementierung und die ursprüngliche Modell-Implementierung beim ersten Mal nicht genau
+die gleiche Ausgabe liefern oder dass der Vorwärtsdurchlauf einen Fehler auslöst. Seien Sie nicht enttäuscht - das ist zu erwarten! Zuerst
+sollten Sie sicherstellen, dass der Vorwärtsdurchlauf keine Fehler auslöst. Es passiert oft, dass die falschen Dimensionen verwendet
+werden, was zu einem *Dimensionality mismatch*-Fehler führt, oder dass der falsche Datentyp verwendet wird, *z.B.* `torch.long`
+anstelle von `torch.float32`. Zögern Sie nicht, das Hugging Face Team um Hilfe zu bitten, wenn Sie
+bestimmte Fehler nicht lösen können.
+
+Um sicherzustellen, dass die 🤗 Transformers-Implementierung korrekt funktioniert, müssen Sie sicherstellen, dass die Ausgaben
+bis auf eine Genauigkeit von `1e-3` übereinstimmen. Zunächst sollten Sie sicherstellen, dass die Ausgabeformen identisch sind, *d.h.*
+`outputs.shape` sollte für das Skript der 🤗 Transformers-Implementierung und für die ursprüngliche
+Implementierung denselben Wert ergeben. Als Nächstes sollten Sie sicherstellen, dass auch die Ausgabewerte identisch sind. Dies ist einer der schwierigsten
+Teile des Hinzufügens eines neuen Modells. Häufige Gründe, warum die Ausgaben nicht identisch sind, sind:
+
+- Einige Ebenen wurden nicht hinzugefügt, *d.h.* eine *Aktivierungsschicht* wurde nicht hinzugefügt, oder die Residualverbindung wurde vergessen
+- Die Worteinbettungsmatrix wurde nicht gebunden
+- Es werden die falschen Positionseinbettungen verwendet, da die ursprüngliche Implementierung einen Offset verwendet
+- Dropout wird während des Vorwärtsdurchlaufs angewendet.
+  Um dies zu beheben, stellen Sie sicher, dass *model.training* auf *False* steht und dass keine Dropout-
+  Schicht während des Vorwärtsdurchlaufs fälschlicherweise aktiviert wird, *d.h.* übergeben Sie *self.training* an [PyTorch's functional dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout)
+
+Der beste Weg, das Problem zu beheben, besteht normalerweise darin, sich den Vorwärtsdurchlauf der ursprünglichen Implementierung und die 🤗
+Transformers-Implementierung nebeneinander anzusehen und zu prüfen, ob es Unterschiede gibt. Idealerweise sollten Sie die
+Zwischenergebnisse beider Implementierungen des Vorwärtsdurchlaufs debuggen/ausdrucken, um die genaue Position im Netzwerk zu finden, an der die 🤗
+Transformers-Implementierung eine andere Ausgabe zeigt als die ursprüngliche Implementierung. Stellen Sie zunächst sicher, dass die
+hartcodierten `input_ids` in beiden Skripten identisch sind. Überprüfen Sie dann, ob die Ausgaben der ersten Transformation der
+`input_ids` (normalerweise die Worteinbettungen) identisch sind. Und dann arbeiten Sie sich bis zur allerletzten Schicht des
+Netzwerks vor. Irgendwann werden Sie einen Unterschied zwischen den beiden Implementierungen feststellen, der Sie auf den Fehler
+in der 🤗 Transformers-Implementierung hinweist. Unserer Erfahrung nach ist es ein einfacher und effizienter Weg, an den gleichen Stellen im Netzwerk
+sowohl in der Original-Implementierung als auch in der 🤗 Transformers-Implementierung viele Print-Anweisungen hinzuzufügen
+und nacheinander die Print-Anweisungen zu entfernen, die dieselben Werte für die Zwischendarstellungen anzeigen.
+
+Wenn Sie sicher sind, dass beide Implementierungen die gleiche Ausgabe liefern, und Sie die Ausgaben mit
+`torch.allclose(original_output, output, atol=1e-3)` überprüft haben, haben Sie den schwierigsten Teil hinter sich! Herzlichen Glückwunsch - die
+Arbeit, die noch zu erledigen ist, sollte ein Kinderspiel sein 😊.
+
+**8. Hinzufügen aller notwendigen Modelltests**
+
+An diesem Punkt haben Sie erfolgreich ein neues Modell hinzugefügt. Es ist jedoch sehr gut möglich, dass das Modell noch nicht
+vollständig dem erforderlichen Design entspricht. Um sicherzustellen, dass die Implementierung vollständig kompatibel mit 🤗 Transformers ist, sollten alle
+gemeinsamen Tests bestehen. Der Cookiecutter sollte automatisch eine Testdatei für Ihr Modell hinzugefügt haben, wahrscheinlich unter
+`tests/models/brand_new_bert/test_modeling_brand_new_bert.py`. Führen Sie diese Testdatei aus, um zu überprüfen, ob alle gängigen
+Tests bestehen:
+
+```bash
+pytest tests/models/brand_new_bert/test_modeling_brand_new_bert.py
+```
+
+Nachdem alle allgemeinen Tests bestehen, müssen Sie nun sicherstellen, dass all die schöne Arbeit, die Sie geleistet haben, gut getestet ist, damit
+
+- a) die Community Ihre Arbeit leicht nachvollziehen kann, indem sie sich spezifische Tests von *brand_new_bert* ansieht
+- b) zukünftige Änderungen an Ihrem Modell keine wichtigen Funktionen des Modells zerstören.
+
+Als Erstes sollten Sie Integrationstests hinzufügen. Diese Integrationstests tun im Wesentlichen dasselbe wie die Debugging-Skripte,
+die Sie zuvor zur Implementierung des Modells in 🤗 Transformers verwendet haben. Eine Vorlage für diese Modelltests wurde bereits vom
+Cookiecutter hinzugefügt; sie heißt `BrandNewBertModelIntegrationTests` und muss nur noch von Ihnen ausgefüllt werden.
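+
+Ein solcher ausgefüllter Integrationstest könnte - als rein hypothetische Skizze, mit einem Platzhalter-Checkpoint und beispielhaften
+Erwartungswerten (in der Praxis stammen diese aus dem Forward Pass der Original-Implementierung) - etwa so aussehen:
+
+```python
+import unittest
+
+import torch
+
+from transformers import BrandNewBertModel
+from transformers.testing_utils import slow
+
+
+class BrandNewBertModelIntegrationTests(unittest.TestCase):
+    @slow
+    def test_inference_no_head(self):
+        model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder")
+        model.eval()
+
+        input_ids = torch.tensor([[0, 4, 4, 3, 2, 4, 1, 7, 19]])
+        with torch.no_grad():
+            output = model(input_ids).last_hidden_state
+
+        expected_shape = torch.Size((1, 9, model.config.hidden_size))
+        self.assertEqual(output.shape, expected_shape)
+
+        # Erwartungswerte aus der Original-Implementierung (hier nur Beispielzahlen)
+        expected_slice = torch.tensor(
+            [[[-0.1465, -0.6501, 0.1993], [-0.4417, -0.5920, 0.3450], [-0.5009, -0.7122, 0.4548]]]
+        )
+        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))
+```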
Um sicherzustellen, dass diese +Tests erfolgreich sind, führen Sie + +```bash +RUN_SLOW=1 pytest -sv tests/models/brand_new_bert/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests +``` + + + +Falls Sie Windows verwenden, sollten Sie `RUN_SLOW=1` durch `SET RUN_SLOW=1` ersetzen. + + + +Zweitens sollten alle Funktionen, die speziell für *brand_new_bert* sind, zusätzlich in einem separaten Test getestet werden unter +`BrandNewBertModelTester`/``BrandNewBertModelTest`. Dieser Teil wird oft vergessen, ist aber in zweierlei Hinsicht äußerst nützlich +Weise: + +- Er hilft dabei, das Wissen, das Sie während der Modellerweiterung erworben haben, an die Community weiterzugeben, indem er zeigt, wie die + speziellen Funktionen von *brand_new_bert* funktionieren sollten. +- Künftige Mitwirkende können Änderungen am Modell schnell testen, indem sie diese speziellen Tests ausführen. + + +**9. Implementieren Sie den Tokenizer** + +Als nächstes sollten wir den Tokenizer von *brand_new_bert* hinzufügen. Normalerweise ist der Tokenizer äquivalent oder sehr ähnlich zu einem +bereits vorhandenen Tokenizer von 🤗 Transformers. + +Es ist sehr wichtig, die ursprüngliche Tokenizer-Datei zu finden/extrahieren und es zu schaffen, diese Datei in die 🤗 +Transformers Implementierung des Tokenizers zu laden. + +Um sicherzustellen, dass der Tokenizer korrekt funktioniert, empfiehlt es sich, zunächst ein Skript im ursprünglichen Repository zu erstellen +zu erstellen, das eine Zeichenkette eingibt und die `input_ids` zurückgibt. Es könnte etwa so aussehen (in Pseudocode): + +```python +input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." +model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") +input_ids = model.tokenize(input_str) +``` + +Möglicherweise müssen Sie noch einmal einen Blick in das ursprüngliche Repository werfen, um die richtige Tokenizer-Funktion zu finden, oder Sie müssen +Sie müssen vielleicht sogar Änderungen an Ihrem Klon des Original-Repositorys vornehmen, um nur die `input_ids` auszugeben. Nach dem Schreiben +ein funktionierendes Tokenisierungsskript geschrieben, das das ursprüngliche Repository verwendet, sollten Sie ein analoges Skript für 🤗 Transformers +erstellt werden. Es sollte ähnlich wie dieses aussehen: + +```python +from transformers import BrandNewBertTokenizer + +input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." + +tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/") + +input_ids = tokenizer(input_str).input_ids +``` + +Wenn beide `input_ids` die gleichen Werte ergeben, sollte als letzter Schritt auch eine Tokenizer-Testdatei hinzugefügt werden. + +Analog zu den Modellierungstestdateien von *brand_new_bert* sollten auch die Tokenisierungs-Testdateien von *brand_new_bert* +eine Reihe von fest kodierten Integrationstests enthalten. + +**10. Führen Sie End-to-End-Integrationstests aus** + +Nachdem Sie den Tokenizer hinzugefügt haben, sollten Sie auch ein paar End-to-End-Integrationstests, die sowohl das Modell als auch den +Tokenizer zu `tests/models/brand_new_bert/test_modeling_brand_new_bert.py` in 🤗 Transformers. +Ein solcher Test sollte bei einem aussagekräftigen +Text-zu-Text-Beispiel zeigen, dass die Implementierung von 🤗 Transformers wie erwartet funktioniert. Ein aussagekräftiges Text-zu-Text-Beispiel kann +z.B. 
*ein Quell-zu-Ziel-Übersetzungspaar, ein Artikel-zu-Zusammenfassung-Paar, ein Frage-zu-Antwort-Paar, usw... Wenn keiner der +der portierten Prüfpunkte in einer nachgelagerten Aufgabe feinabgestimmt wurde, genügt es, sich einfach auf die Modelltests zu verlassen. In einem +letzten Schritt, um sicherzustellen, dass das Modell voll funktionsfähig ist, sollten Sie alle Tests auch auf der GPU durchführen. Es kann +Es kann vorkommen, dass Sie vergessen haben, einige `.to(self.device)` Anweisungen zu internen Tensoren des Modells hinzuzufügen, was in einem solchen +Test zu einem Fehler führen würde. Falls Sie keinen Zugang zu einem Grafikprozessor haben, kann das Hugging Face Team diese Tests für Sie durchführen. +Tests für Sie übernehmen. + +**11. Docstring hinzufügen** + +Nun sind alle notwendigen Funktionen für *brand_new_bert* hinzugefügt - Sie sind fast fertig! Das Einzige, was Sie noch hinzufügen müssen, ist +ein schöner Docstring und eine Doku-Seite. Der Cookiecutter sollte eine Vorlagendatei namens +`docs/source/model_doc/brand_new_bert.md` hinzugefügt haben, die Sie ausfüllen sollten. Die Benutzer Ihres Modells werden in der Regel zuerst einen Blick auf +diese Seite ansehen, bevor sie Ihr Modell verwenden. Daher muss die Dokumentation verständlich und prägnant sein. Es ist sehr nützlich für +die Gemeinschaft, einige *Tipps* hinzuzufügen, um zu zeigen, wie das Modell verwendet werden sollte. Zögern Sie nicht, das Hugging Face-Team anzupingen +bezüglich der Docstrings. + +Stellen Sie als nächstes sicher, dass der zu `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` hinzugefügte docstring +korrekt ist und alle erforderlichen Eingaben und Ausgaben enthält. Wir haben eine ausführliche Anleitung zum Schreiben von Dokumentationen und unserem Docstring-Format [hier](writing-documentation). Es ist immer gut, sich daran zu erinnern, dass die Dokumentation +mindestens so sorgfältig behandelt werden sollte wie der Code in 🤗 Transformers, denn die Dokumentation ist in der Regel der erste Kontaktpunkt der +Berührungspunkt der Community mit dem Modell ist. + +**Code refactor** + +Großartig, jetzt haben Sie den gesamten erforderlichen Code für *brand_new_bert* hinzugefügt. An diesem Punkt sollten Sie einige mögliche +falschen Codestil korrigieren, indem Sie ausführen: + +```bash +make style +``` + +und überprüfen Sie, ob Ihr Kodierungsstil die Qualitätsprüfung besteht: + +```bash +make quality +``` + +Es gibt noch ein paar andere sehr strenge Designtests in 🤗 Transformers, die möglicherweise noch fehlschlagen, was sich in den +den Tests Ihres Pull Requests. Dies liegt oft an fehlenden Informationen im Docstring oder an einer falschen +Benennung. Das Hugging Face Team wird Ihnen sicherlich helfen, wenn Sie hier nicht weiterkommen. + +Und schließlich ist es immer eine gute Idee, den eigenen Code zu refaktorisieren, nachdem man sichergestellt hat, dass er korrekt funktioniert. Wenn alle +Tests bestanden haben, ist es nun an der Zeit, den hinzugefügten Code noch einmal durchzugehen und einige Überarbeitungen vorzunehmen. + +Sie haben nun den Codierungsteil abgeschlossen, herzlichen Glückwunsch! 🎉 Sie sind großartig! 😎 + +**12. Laden Sie die Modelle in den Model Hub hoch** + +In diesem letzten Teil sollten Sie alle Checkpoints konvertieren und in den Modell-Hub hochladen und eine Modellkarte für jeden +hochgeladenen Modell-Kontrollpunkt. Sie können sich mit den Hub-Funktionen vertraut machen, indem Sie unsere [Model sharing and uploading Page](model_sharing) lesen. 
Hier sollten Sie mit dem Hugging Face-Team zusammenarbeiten, um einen passenden Namen für jeden +Checkpoint festzulegen und die erforderlichen Zugriffsrechte zu erhalten, um das Modell unter der Organisation des Autors *brand_new_bert* hochladen zu können. +*brand_new_bert*. Die Methode `push_to_hub`, die in allen Modellen in `transformers` vorhanden ist, ist ein schneller und effizienter Weg, Ihren Checkpoint in den Hub zu pushen. Ein kleines Snippet ist unten eingefügt: + +```python +brand_new_bert.push_to_hub("brand_new_bert") +# Uncomment the following line to push to an organization. +# brand_new_bert.push_to_hub("/brand_new_bert") +``` + +Es lohnt sich, etwas Zeit darauf zu verwenden, für jeden Kontrollpunkt passende Musterkarten zu erstellen. Die Modellkarten sollten die +spezifischen Merkmale dieses bestimmten Prüfpunkts hervorheben, * z.B.* auf welchem Datensatz wurde der Prüfpunkt +vortrainiert/abgestimmt? Für welche nachgelagerte Aufgabe sollte das Modell verwendet werden? Und fügen Sie auch etwas Code bei, wie Sie +wie das Modell korrekt verwendet wird. + +**13. (Optional) Notizbuch hinzufügen** + +Es ist sehr hilfreich, ein Notizbuch hinzuzufügen, in dem im Detail gezeigt wird, wie *brand_new_bert* für Schlussfolgerungen verwendet werden kann und/oder +bei einer nachgelagerten Aufgabe feinabgestimmt wird. Dies ist nicht zwingend erforderlich, um Ihren PR zusammenzuführen, aber sehr nützlich für die Gemeinschaft. + +**14. Reichen Sie Ihren fertigen PR ein** + +Sie sind jetzt mit der Programmierung fertig und können zum letzten Schritt übergehen, nämlich der Zusammenführung Ihres PR mit main. Normalerweise hat das +Hugging Face Team Ihnen an diesem Punkt bereits geholfen haben, aber es lohnt sich, sich etwas Zeit zu nehmen, um Ihrem fertigen +PR eine schöne Beschreibung zu geben und eventuell Kommentare zu Ihrem Code hinzuzufügen, wenn Sie Ihren Gutachter auf bestimmte Designentscheidungen hinweisen wollen. +Gutachter hinweisen wollen. + +### Teilen Sie Ihre Arbeit!! + +Jetzt ist es an der Zeit, von der Community Anerkennung für Ihre Arbeit zu bekommen! Die Fertigstellung einer Modellergänzung ist ein wichtiger +Beitrag zu Transformers und der gesamten NLP-Gemeinschaft. Ihr Code und die portierten vortrainierten Modelle werden sicherlich +von Hunderten und vielleicht sogar Tausenden von Entwicklern und Forschern genutzt werden. Sie sollten stolz auf Ihre Arbeit sein und Ihre +Ihre Leistung mit der Gemeinschaft teilen. + +**Sie haben ein weiteres Modell erstellt, das für jeden in der Community super einfach zugänglich ist! 🤯** diff --git a/docs/source/de/add_new_pipeline.md b/docs/source/de/add_new_pipeline.md new file mode 100644 index 000000000000..7615ac7bfd59 --- /dev/null +++ b/docs/source/de/add_new_pipeline.md @@ -0,0 +1,258 @@ + + +# Wie erstellt man eine benutzerdefinierte Pipeline? + +In dieser Anleitung sehen wir uns an, wie Sie eine benutzerdefinierte Pipeline erstellen und sie auf dem [Hub](hf.co/models) freigeben oder sie der +🤗 Transformers-Bibliothek hinzufügen. + +Zuallererst müssen Sie entscheiden, welche Roheingaben die Pipeline verarbeiten kann. Es kann sich um Strings, rohe Bytes, +Dictionaries oder was auch immer die wahrscheinlichste gewünschte Eingabe ist. Versuchen Sie, diese Eingaben so rein wie möglich in Python zu halten +denn das macht die Kompatibilität einfacher (auch mit anderen Sprachen über JSON). Dies werden die Eingaben der +Pipeline (`Vorverarbeitung`). + +Definieren Sie dann die `Outputs`. Dieselbe Richtlinie wie für die Eingänge. 
Je einfacher, desto besser. Dies werden die Ausgaben der Methode `postprocess` sein.
+
+Beginnen Sie damit, von der Basisklasse `Pipeline` zu erben und die 4 Methoden zu implementieren, die dafür benötigt werden:
+`preprocess`, `_forward`, `postprocess` und `_sanitize_parameters`.
+
+
+```python
+from transformers import Pipeline
+
+
+class MyPipeline(Pipeline):
+    def _sanitize_parameters(self, **kwargs):
+        preprocess_kwargs = {}
+        if "maybe_arg" in kwargs:
+            preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
+        return preprocess_kwargs, {}, {}
+
+    def preprocess(self, inputs, maybe_arg=2):
+        model_input = Tensor(inputs["input_ids"])
+        return {"model_input": model_input}
+
+    def _forward(self, model_inputs):
+        # model_inputs == {"model_input": model_input}
+        outputs = self.model(**model_inputs)
+        # Maybe {"logits": Tensor(...)}
+        return outputs
+
+    def postprocess(self, model_outputs):
+        best_class = model_outputs["logits"].softmax(-1)
+        return best_class
+```
+
+Die Struktur dieser Aufteilung soll eine relativ nahtlose Unterstützung für CPU/GPU ermöglichen und es gleichzeitig erlauben, die
+Vor-/Nachbearbeitung auf der CPU in verschiedenen Threads durchzuführen.
+
+`preprocess` nimmt die ursprünglich definierten Eingaben und wandelt sie in etwas um, das in das Modell eingespeist werden kann. Es kann
+mehr Informationen enthalten und ist normalerweise ein `Dict`.
+
+`_forward` ist das Implementierungsdetail und ist nicht dafür gedacht, direkt aufgerufen zu werden. `forward` ist die bevorzugte
+aufzurufende Methode, da sie Sicherheitsvorkehrungen enthält, die sicherstellen, dass alles auf dem erwarteten Gerät funktioniert. Wenn etwas
+mit einem realen Modell verknüpft ist, gehört es in die Methode `_forward`, alles andere gehört in die Methoden `preprocess`/`postprocess`.
+
+Die Methode `postprocess` nimmt die Ausgabe von `_forward` und verwandelt sie in die endgültige Ausgabe, die zuvor festgelegt wurde.
+
+Die Methode `_sanitize_parameters` ermöglicht es dem Benutzer, beliebige Parameter zu übergeben, wann immer er möchte, sei es zur Initialisierungszeit
+`pipeline(...., maybe_arg=4)` oder zur Aufrufzeit `pipe = pipeline(...); output = pipe(...., maybe_arg=4)`.
+
+Die Rückgabe von `_sanitize_parameters` sind die 3 Dicts von kwargs, die direkt an `preprocess`,
+`_forward` und `postprocess` übergeben werden. Füllen Sie nichts aus, wenn der Aufrufer keinen zusätzlichen Parameter angegeben hat. Das
+erlaubt es, die Standardargumente in der Funktionsdefinition beizubehalten, was immer "natürlicher" ist.
+
+Ein klassisches Beispiel wäre das Argument `top_k` in der Nachbearbeitung bei Klassifizierungsaufgaben.
+
+```python
+>>> pipe = pipeline("my-new-task")
+>>> pipe("This is a test")
+[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05}
+{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}]
+
+>>> pipe("This is a test", top_k=2)
+[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}]
+```
+
+Um dies zu erreichen, aktualisieren wir unsere Methode `postprocess` mit einem Standardwert von `5` und passen
+`_sanitize_parameters` so an, dass dieser neue Parameter erlaubt ist.
+ + +```python +def postprocess(self, model_outputs, top_k=5): + best_class = model_outputs["logits"].softmax(-1) + # Add logic to handle top_k + return best_class + + +def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "maybe_arg" in kwargs: + preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] + + postprocess_kwargs = {} + if "top_k" in kwargs: + postprocess_kwargs["top_k"] = kwargs["top_k"] + return preprocess_kwargs, {}, postprocess_kwargs +``` + +Versuchen Sie, die Eingaben/Ausgaben sehr einfach und idealerweise JSON-serialisierbar zu halten, da dies die Verwendung der Pipeline sehr einfach macht +ohne dass die Benutzer neue Arten von Objekten verstehen müssen. Es ist auch relativ üblich, viele verschiedene Arten von Argumenten zu unterstützen +von Argumenten zu unterstützen (Audiodateien, die Dateinamen, URLs oder reine Bytes sein können). + + + +## Hinzufügen zur Liste der unterstützten Aufgaben + +Um Ihre `neue Aufgabe` in die Liste der unterstützten Aufgaben aufzunehmen, müssen Sie sie zur `PIPELINE_REGISTRY` hinzufügen: + +```python +from transformers.pipelines import PIPELINE_REGISTRY + +PIPELINE_REGISTRY.register_pipeline( + "new-task", + pipeline_class=MyPipeline, + pt_model=AutoModelForSequenceClassification, +) +``` + +Wenn Sie möchten, können Sie ein Standardmodell angeben. In diesem Fall sollte es mit einer bestimmten Revision (die der Name einer Verzweigung oder ein Commit-Hash sein kann, hier haben wir `"abcdef"` genommen) sowie mit dem Typ versehen sein: + +```python +PIPELINE_REGISTRY.register_pipeline( + "new-task", + pipeline_class=MyPipeline, + pt_model=AutoModelForSequenceClassification, + default={"pt": ("user/awesome_model", "abcdef")}, + type="text", # current support type: text, audio, image, multimodal +) +``` + +## Teilen Sie Ihre Pipeline auf dem Hub + +Um Ihre benutzerdefinierte Pipeline auf dem Hub freizugeben, müssen Sie lediglich den benutzerdefinierten Code Ihrer `Pipeline`-Unterklasse in einer +Python-Datei speichern. Nehmen wir zum Beispiel an, Sie möchten eine benutzerdefinierte Pipeline für die Klassifizierung von Satzpaaren wie folgt verwenden: + +```py +import numpy as np + +from transformers import Pipeline + + +def softmax(outputs): + maxes = np.max(outputs, axis=-1, keepdims=True) + shifted_exp = np.exp(outputs - maxes) + return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) + + +class PairClassificationPipeline(Pipeline): + def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "second_text" in kwargs: + preprocess_kwargs["second_text"] = kwargs["second_text"] + return preprocess_kwargs, {}, {} + + def preprocess(self, text, second_text=None): + return self.tokenizer(text, text_pair=second_text, return_tensors=self.framework) + + def _forward(self, model_inputs): + return self.model(**model_inputs) + + def postprocess(self, model_outputs): + logits = model_outputs.logits[0].numpy() + probabilities = softmax(logits) + + best_class = np.argmax(probabilities) + label = self.model.config.id2label[best_class] + score = probabilities[best_class].item() + logits = logits.tolist() + return {"label": label, "score": score, "logits": logits} +``` + +Die Implementierung ist Framework-unabhängig und funktioniert für PyTorch- und TensorFlow-Modelle. 
Wenn wir dies in einer Datei +einer Datei namens `pair_classification.py` gespeichert haben, können wir sie importieren und wie folgt registrieren: + +```py +from pair_classification import PairClassificationPipeline +from transformers.pipelines import PIPELINE_REGISTRY +from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification + +PIPELINE_REGISTRY.register_pipeline( + "pair-classification", + pipeline_class=PairClassificationPipeline, + pt_model=AutoModelForSequenceClassification, + tf_model=TFAutoModelForSequenceClassification, +) +``` + +Sobald dies geschehen ist, können wir es mit einem vortrainierten Modell verwenden. Zum Beispiel wurde `sgugger/finetuned-bert-mrpc` auf den +auf den MRPC-Datensatz abgestimmt, der Satzpaare als Paraphrasen oder nicht klassifiziert. + +```py +from transformers import pipeline + +classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc") +``` + +Dann können wir sie auf dem Hub mit der Methode `save_pretrained` in einem `Repository` freigeben: + +```py +from huggingface_hub import Repository + +repo = Repository("test-dynamic-pipeline", clone_from="{your_username}/test-dynamic-pipeline") +classifier.save_pretrained("test-dynamic-pipeline") +repo.push_to_hub() +``` + +Dadurch wird die Datei, in der Sie `PairClassificationPipeline` definiert haben, in den Ordner `"test-dynamic-pipeline"` kopiert, +und speichert das Modell und den Tokenizer der Pipeline, bevor Sie alles in das Repository verschieben +`{Ihr_Benutzername}/test-dynamic-pipeline`. Danach kann jeder die Pipeline verwenden, solange er die Option +`trust_remote_code=True` angeben: + +```py +from transformers import pipeline + +classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True) +``` + +## Hinzufügen der Pipeline zu 🤗 Transformers + +Wenn Sie Ihre Pipeline zu 🤗 Transformers beitragen möchten, müssen Sie ein neues Modul im Untermodul `pipelines` hinzufügen +mit dem Code Ihrer Pipeline hinzufügen. Fügen Sie es dann der Liste der in `pipelines/__init__.py` definierten Aufgaben hinzu. + +Dann müssen Sie noch Tests hinzufügen. Erstellen Sie eine neue Datei `tests/test_pipelines_MY_PIPELINE.py` mit Beispielen für die anderen Tests. + +Die Funktion `run_pipeline_test` ist sehr allgemein gehalten und läuft auf kleinen Zufallsmodellen auf jeder möglichen +Architektur, wie durch `model_mapping` und `tf_model_mapping` definiert. + +Dies ist sehr wichtig, um die zukünftige Kompatibilität zu testen, d.h. wenn jemand ein neues Modell für +`XXXForQuestionAnswering` hinzufügt, wird der Pipeline-Test versuchen, mit diesem Modell zu arbeiten. Da die Modelle zufällig sind, ist es +ist es unmöglich, die tatsächlichen Werte zu überprüfen. Deshalb gibt es eine Hilfsfunktion `ANY`, die einfach versucht, die +Ausgabe der Pipeline TYPE. + +Außerdem *müssen* Sie 2 (idealerweise 4) Tests implementieren. + +- test_small_model_pt` : Definieren Sie 1 kleines Modell für diese Pipeline (es spielt keine Rolle, ob die Ergebnisse keinen Sinn ergeben) + und testen Sie die Ausgaben der Pipeline. Die Ergebnisse sollten die gleichen sein wie bei `test_small_model_tf`. +- test_small_model_tf : Definieren Sie 1 kleines Modell für diese Pipeline (es spielt keine Rolle, ob die Ergebnisse keinen Sinn ergeben) + und testen Sie die Ausgaben der Pipeline. Die Ergebnisse sollten die gleichen sein wie bei `test_small_model_pt`. 
+- test_large_model_pt` (`optional`): Testet die Pipeline an einer echten Pipeline, bei der die Ergebnisse + Sinn machen. Diese Tests sind langsam und sollten als solche gekennzeichnet werden. Hier geht es darum, die Pipeline zu präsentieren und sicherzustellen + sicherzustellen, dass es in zukünftigen Versionen keine Abweichungen gibt. +- test_large_model_tf` (`optional`): Testet die Pipeline an einer echten Pipeline, bei der die Ergebnisse + Sinn machen. Diese Tests sind langsam und sollten als solche gekennzeichnet werden. Hier geht es darum, die Pipeline zu präsentieren und sicherzustellen + sicherzustellen, dass es in zukünftigen Versionen keine Abweichungen gibt. diff --git a/docs/source/de/add_tensorflow_model.md b/docs/source/de/add_tensorflow_model.md new file mode 100644 index 000000000000..cc640aeb5e64 --- /dev/null +++ b/docs/source/de/add_tensorflow_model.md @@ -0,0 +1,356 @@ + + +# Wie konvertiert man ein 🤗 Transformers-Modell in TensorFlow? + +Die Tatsache, dass mehrere Frameworks für die Verwendung mit 🤗 Transformers zur Verfügung stehen, gibt Ihnen die Flexibilität, deren Stärken beim Entwurf Ihrer Anwendung auszuspielen. +Ihre Anwendung zu entwerfen, aber das bedeutet auch, dass die Kompatibilität für jedes Modell einzeln hinzugefügt werden muss. Die gute Nachricht ist, dass +das Hinzufügen von TensorFlow-Kompatibilität zu einem bestehenden Modell einfacher ist als [das Hinzufügen eines neuen Modells von Grund auf](add_new_model)! +Ob Sie ein tieferes Verständnis für große TensorFlow-Modelle haben möchten, einen wichtigen Open-Source-Beitrag leisten oder +TensorFlow für das Modell Ihrer Wahl aktivieren wollen, dieser Leitfaden ist für Sie. + +Dieser Leitfaden befähigt Sie, ein Mitglied unserer Gemeinschaft, TensorFlow-Modellgewichte und/oder +Architekturen beizusteuern, die in 🤗 Transformers verwendet werden sollen, und zwar mit minimaler Betreuung durch das Hugging Face Team. Das Schreiben eines neuen Modells +ist keine Kleinigkeit, aber ich hoffe, dass dieser Leitfaden dazu beiträgt, dass es weniger eine Achterbahnfahrt 🎢 und mehr ein Spaziergang im Park 🚶 ist. +Die Nutzung unserer kollektiven Erfahrungen ist absolut entscheidend, um diesen Prozess immer einfacher zu machen, und deshalb möchten wir +ermutigen Sie daher, Verbesserungsvorschläge für diesen Leitfaden zu machen! + +Bevor Sie tiefer eintauchen, empfehlen wir Ihnen, die folgenden Ressourcen zu lesen, wenn Sie neu in 🤗 Transformers sind: +- [Allgemeiner Überblick über 🤗 Transformers](add_new_model#general-overview-of-transformers) +- [Die TensorFlow-Philosophie von Hugging Face](https://huggingface.co/blog/tensorflow-philosophy) + +Im Rest dieses Leitfadens werden Sie lernen, was nötig ist, um eine neue TensorFlow Modellarchitektur hinzuzufügen, die +Verfahren zur Konvertierung von PyTorch in TensorFlow-Modellgewichte und wie Sie Unstimmigkeiten zwischen ML +Frameworks. Legen Sie los! + + + +Sind Sie unsicher, ob das Modell, das Sie verwenden möchten, bereits eine entsprechende TensorFlow-Architektur hat? + +  + +Überprüfen Sie das Feld `model_type` in der `config.json` des Modells Ihrer Wahl +([Beispiel](https://huggingface.co/bert-base-uncased/blob/main/config.json#L14)). Wenn der entsprechende Modellordner in +🤗 Transformers eine Datei hat, deren Name mit "modeling_tf" beginnt, bedeutet dies, dass es eine entsprechende TensorFlow +Architektur hat ([Beispiel](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert)). 
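+
+Eine schnelle Möglichkeit, dies programmatisch zu überprüfen, zeigt die folgende kleine Skizze (sie setzt eine installierte
+`transformers`-Version mit TensorFlow-Unterstützung sowie Internetzugang voraus):
+
+```python
+from transformers import AutoConfig, TFAutoModel
+
+config = AutoConfig.from_pretrained("bert-base-uncased")
+print(config.model_type)  # -> "bert"
+
+# Schlägt das Laden hier fehl, existiert für diese Architektur vermutlich noch keine
+# TensorFlow-Implementierung bzw. es fehlen die TensorFlow-Gewichte.
+tf_model = TFAutoModel.from_pretrained("bert-base-uncased")
+print(type(tf_model).__name__)  # -> "TFBertModel"
+```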
+ + + + +## Schritt-für-Schritt-Anleitung zum Hinzufügen von TensorFlow-Modellarchitektur-Code + +Es gibt viele Möglichkeiten, eine große Modellarchitektur zu entwerfen, und viele Möglichkeiten, diesen Entwurf zu implementieren. Wie auch immer, +Sie erinnern sich vielleicht an unseren [allgemeinen Überblick über 🤗 Transformers](add_new_model#general-overview-of-transformers) +wissen, dass wir ein meinungsfreudiger Haufen sind - die Benutzerfreundlichkeit von 🤗 Transformers hängt von konsistenten Designentscheidungen ab. Aus +Erfahrung können wir Ihnen ein paar wichtige Dinge über das Hinzufügen von TensorFlow-Modellen sagen: + +- Erfinden Sie das Rad nicht neu! In den meisten Fällen gibt es mindestens zwei Referenzimplementierungen, die Sie überprüfen sollten: das +PyTorch-Äquivalent des Modells, das Sie implementieren, und andere TensorFlow-Modelle für dieselbe Klasse von Problemen. +- Gute Modellimplementierungen überleben den Test der Zeit. Dies geschieht nicht, weil der Code hübsch ist, sondern eher +sondern weil der Code klar, einfach zu debuggen und darauf aufzubauen ist. Wenn Sie den Maintainern das Leben mit Ihrer +TensorFlow-Implementierung leicht machen, indem Sie die gleichen Muster wie in anderen TensorFlow-Modellen nachbilden und die Abweichung +zur PyTorch-Implementierung minimieren, stellen Sie sicher, dass Ihr Beitrag lange Bestand haben wird. +- Bitten Sie um Hilfe, wenn Sie nicht weiterkommen! Das 🤗 Transformers-Team ist da, um zu helfen, und wir haben wahrscheinlich Lösungen für die gleichen +Probleme gefunden, vor denen Sie stehen. + +Hier finden Sie einen Überblick über die Schritte, die zum Hinzufügen einer TensorFlow-Modellarchitektur erforderlich sind: +1. Wählen Sie das Modell, das Sie konvertieren möchten +2. Bereiten Sie die Transformers-Entwicklungsumgebung vor. +3. (Optional) Verstehen Sie die theoretischen Aspekte und die bestehende Implementierung +4. Implementieren Sie die Modellarchitektur +5. Implementieren Sie Modelltests +6. Reichen Sie den Pull-Antrag ein +7. (Optional) Erstellen Sie Demos und teilen Sie diese mit der Welt + +### 1.-3. Bereiten Sie Ihren Modellbeitrag vor + +**1. Wählen Sie das Modell, das Sie konvertieren möchten** + +Beginnen wir mit den Grundlagen: Als erstes müssen Sie die Architektur kennen, die Sie konvertieren möchten. Wenn Sie +Sie sich nicht auf eine bestimmte Architektur festgelegt haben, ist es eine gute Möglichkeit, das 🤗 Transformers-Team um Vorschläge zu bitten. +Wir werden Sie zu den wichtigsten Architekturen führen, die auf der TensorFlow-Seite noch fehlen. +Seite fehlen. Wenn das spezifische Modell, das Sie mit TensorFlow verwenden möchten, bereits eine Implementierung der TensorFlow-Architektur in +🤗 Transformers, aber es fehlen Gewichte, können Sie direkt in den +Abschnitt [Gewichtskonvertierung](#adding-tensorflow-weights-to-hub) +auf dieser Seite. + +Der Einfachheit halber wird im Rest dieser Anleitung davon ausgegangen, dass Sie sich entschieden haben, mit der TensorFlow-Version von +*BrandNewBert* (dasselbe Beispiel wie in der [Anleitung](add_new_model), um ein neues Modell von Grund auf hinzuzufügen). + + + +Bevor Sie mit der Arbeit an einer TensorFlow-Modellarchitektur beginnen, sollten Sie sich vergewissern, dass es keine laufenden Bemühungen in dieser Richtung gibt. +Sie können nach `BrandNewBert` auf der +[pull request GitHub page](https://github.com/huggingface/transformers/pulls?q=is%3Apr), um zu bestätigen, dass es keine +TensorFlow-bezogene Pull-Anfrage gibt. + + + + +**2. 
Transformers-Entwicklungsumgebung vorbereiten** + +Nachdem Sie die Modellarchitektur ausgewählt haben, öffnen Sie einen PR-Entwurf, um Ihre Absicht zu signalisieren, daran zu arbeiten. Folgen Sie den +Anweisungen, um Ihre Umgebung einzurichten und einen PR-Entwurf zu öffnen. + +1. Forken Sie das [repository](https://github.com/huggingface/transformers), indem Sie auf der Seite des Repositorys auf die Schaltfläche 'Fork' klicken. + Dadurch wird eine Kopie des Codes unter Ihrem GitHub-Benutzerkonto erstellt. + +2. Klonen Sie Ihren `transformers` Fork auf Ihre lokale Festplatte und fügen Sie das Basis-Repository als Remote hinzu: + +```bash +git clone https://github.com/[your Github handle]/transformers.git +cd transformers +git remote add upstream https://github.com/huggingface/transformers.git +``` + +3. Richten Sie eine Entwicklungsumgebung ein, indem Sie z.B. den folgenden Befehl ausführen: + +```bash +python -m venv .env +source .env/bin/activate +pip install -e ".[dev]" +``` + +Abhängig von Ihrem Betriebssystem und da die Anzahl der optionalen Abhängigkeiten von Transformers wächst, kann es sein, dass Sie bei diesem Befehl einen +Fehler erhalten. Wenn das der Fall ist, stellen Sie sicher, dass Sie TensorFlow installieren, und führen Sie dann aus: + +```bash +pip install -e ".[quality]" +``` + +**Hinweis:** Sie müssen CUDA nicht installiert haben. Es reicht aus, das neue Modell auf der CPU laufen zu lassen. + +4. Erstellen Sie ausgehend von Ihrem Hauptbranch einen Branch mit einem beschreibenden Namen + +```bash +git checkout -b add_tf_brand_new_bert +``` + +5. Holen Sie sich den aktuellen main-Branch und rebasen Sie Ihren Branch darauf + +```bash +git fetch upstream +git rebase upstream/main +``` + +6. Fügen Sie eine leere `.py` Datei in `transformers/src/models/brandnewbert/` mit dem Namen `modeling_tf_brandnewbert.py` hinzu. Dies wird +Ihre TensorFlow-Modelldatei sein. + +7. Übertragen Sie die Änderungen auf Ihr Konto mit: + +```bash +git add . +git commit -m "initial commit" +git push -u origin add_tf_brand_new_bert +``` + +8. Wenn Sie zufrieden sind, gehen Sie auf die Webseite Ihres Forks auf GitHub. Klicken Sie auf "Pull request". Stellen Sie sicher, dass Sie die + GitHub-Handles einiger Mitglieder des Hugging Face-Teams als Reviewer hinzufügen, damit das Hugging Face-Team über + zukünftige Änderungen benachrichtigt wird. + +9. Ändern Sie den PR in einen Entwurf, indem Sie auf der rechten Seite der GitHub-Pull-Request-Webseite auf "In Entwurf umwandeln" klicken. + + +Jetzt haben Sie eine Entwicklungsumgebung eingerichtet, um *BrandNewBert* nach TensorFlow in 🤗 Transformers zu portieren. + + +**3. (Optional) Verstehen Sie die theoretischen Aspekte und die bestehende Implementierung** + +Sie sollten sich etwas Zeit nehmen, um die Arbeit zu *BrandNewBert* zu lesen, falls eine solche Beschreibung existiert. Möglicherweise gibt es große +Abschnitte des Papiers, die schwer zu verstehen sind. Wenn das der Fall ist, ist das in Ordnung - machen Sie sich keine Sorgen! Das Ziel ist +es nicht, ein tiefes theoretisches Verständnis des Papiers zu erlangen, sondern die notwendigen Informationen zu extrahieren, um +das Modell mit Hilfe von TensorFlow effektiv in 🤗 Transformers neu zu implementieren. Das heißt, Sie müssen nicht zu +viel Zeit auf die theoretischen Aspekte verwenden, sondern sich lieber auf die praktischen Aspekte konzentrieren, nämlich auf die bestehende Modelldokumentationsseite +(z.B. 
[model docs for BERT](model_doc/bert)). + +Nachdem Sie die Grundlagen des Modells, das Sie implementieren wollen, verstanden haben, ist es wichtig, die bestehende +Implementierung zu verstehen. Dies ist eine gute Gelegenheit, sich zu vergewissern, dass eine funktionierende Implementierung Ihren Erwartungen an das +Modell entspricht, und um technische Herausforderungen auf der TensorFlow-Seite vorauszusehen. + +Es ist ganz natürlich, dass Sie sich von der Menge an Informationen, die Sie gerade aufgesogen haben, überwältigt fühlen. Es +ist definitiv nicht erforderlich, dass Sie in dieser Phase alle Facetten des Modells verstehen. Dennoch ermutigen wir Sie dringend, +alle dringenden Fragen in unserem [Forum](https://discuss.huggingface.co/) zu klären. + + +### 4. Implementierung des Modells + +Jetzt ist es an der Zeit, endlich mit dem Programmieren zu beginnen. Als Ausgangspunkt empfehlen wir die PyTorch-Datei selbst: Kopieren Sie den Inhalt von +`modeling_brand_new_bert.py` in `src/transformers/models/brand_new_bert/` nach +`modeling_tf_brand_new_bert.py`. Das Ziel dieses Abschnitts ist es, die Datei zu ändern und die Importstruktur von +🤗 Transformers zu aktualisieren, so dass Sie `TFBrandNewBert` importieren können und +`TFBrandNewBert.from_pretrained(model_repo, from_pt=True)` erfolgreich ein funktionierendes TensorFlow-*BrandNewBert*-Modell lädt. + +Leider gibt es kein Rezept, um ein PyTorch-Modell in TensorFlow zu konvertieren. Sie können jedoch unsere Auswahl an +Tipps befolgen, um den Prozess so reibungslos wie möglich zu gestalten: +- Stellen Sie `TF` dem Namen aller Klassen voran (z.B. wird `BrandNewBert` zu `TFBrandNewBert`). +- Die meisten PyTorch-Operationen haben einen direkten TensorFlow-Ersatz. Zum Beispiel entspricht `torch.nn.Linear` der Klasse + `tf.keras.layers.Dense`, `torch.nn.Dropout` entspricht `tf.keras.layers.Dropout`, usw. Wenn Sie sich bei + einer bestimmten Operation nicht sicher sind, können Sie die [TensorFlow-Dokumentation](https://www.tensorflow.org/api_docs/python/tf) + oder die [PyTorch-Dokumentation](https://pytorch.org/docs/stable/) zu Rate ziehen. +- Suchen Sie nach Mustern in der Codebasis von 🤗 Transformers. Wenn Sie auf eine bestimmte Operation stoßen, für die es keinen direkten + Ersatz gibt, stehen die Chancen gut, dass jemand anderes bereits das gleiche Problem hatte. +- Behalten Sie standardmäßig die gleichen Variablennamen und die gleiche Struktur wie in PyTorch bei. Dies erleichtert die Fehlersuche, das Nachverfolgen von + Problemen und das Einarbeiten späterer Korrekturen. +- Einige Ebenen haben in jedem Framework unterschiedliche Standardwerte. Ein bemerkenswertes Beispiel ist das Epsilon der Batch-Normalisierungsschicht + (`1e-5` in [PyTorch](https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm2d.html#torch.nn.BatchNorm2d) + und `1e-3` in [TensorFlow](https://www.tensorflow.org/api_docs/python/tf/keras/layers/BatchNormalization)). + Prüfen Sie die Dokumentation genau! +- Die `nn.Parameter`-Variablen von PyTorch müssen in der Regel innerhalb der `build()`-Methode der TF-Layer initialisiert werden. 
Siehe das folgende + Beispiel: [PyTorch](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_vit_mae.py#L212) / + [TensorFlow](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_tf_vit_mae.py#L220) +- Wenn das PyTorch-Modell ein `#copied from ...` am Anfang einer Funktion hat, stehen die Chancen gut, dass Ihr TensorFlow-Modell + diese Funktion ebenfalls von der Architektur ausleihen kann, von der sie kopiert wurde, vorausgesetzt, diese hat eine TensorFlow-Architektur. +- Die korrekte Zuweisung des Attributs `name` in TensorFlow-Funktionen ist entscheidend für das Cross-Loading der Gewichte + mit `from_pt=True`. `name` ist fast immer der Name der entsprechenden Variablen im PyTorch-Code. Wenn `name` + nicht richtig gesetzt ist, sehen Sie dies in der Fehlermeldung beim Laden der Modellgewichte. +- Die Logik der Basismodellklasse, `BrandNewBertModel`, befindet sich in `TFBrandNewBertMainLayer`, einer Keras- + Schicht-Unterklasse ([Beispiel](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L719)). + `TFBrandNewBertModel` ist lediglich ein Wrapper für diese Schicht. +- Keras-Modelle müssen gebaut werden, bevor die vortrainierten Gewichte geladen werden können. Aus diesem Grund muss `TFBrandNewBertPreTrainedModel` + ein Beispiel für die Eingaben in das Modell enthalten, die `dummy_inputs` + ([Beispiel](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L916)). +- Wenn Sie nicht weiterkommen, fragen Sie nach Hilfe - wir sind für Sie da! 🤗 + +Neben der Modelldatei selbst müssen Sie auch die Verweise auf die Modellklassen und die zugehörigen +Dokumentationsseiten hinzufügen. Sie können diesen Teil ganz nach den Mustern in anderen PRs erledigen +([Beispiel](https://github.com/huggingface/transformers/pull/18020/files)). Hier ist eine Liste der erforderlichen manuellen +Änderungen: +- Fügen Sie alle öffentlichen Klassen von *BrandNewBert* in `src/transformers/__init__.py` ein. +- Fügen Sie die *BrandNewBert*-Klassen zu den entsprechenden Auto-Klassen in `src/transformers/models/auto/modeling_tf_auto.py` hinzu. +- Fügen Sie die zu *BrandNewBert* gehörigen Klassen für träges Laden in `src/transformers/utils/dummy_tf_objects.py` hinzu. +- Aktualisieren Sie die Importstrukturen für die öffentlichen Klassen in `src/transformers/models/brand_new_bert/__init__.py`. +- Fügen Sie die Dokumentationszeiger auf die öffentlichen Methoden von *BrandNewBert* in `docs/source/de/model_doc/brand_new_bert.md` hinzu. +- Fügen Sie sich selbst zur Liste der Mitwirkenden an *BrandNewBert* in `docs/source/de/model_doc/brand_new_bert.md` hinzu. +- Fügen Sie schließlich ein grünes Häkchen ✅ in der TensorFlow-Spalte von *BrandNewBert* in `docs/source/de/index.md` hinzu. + +Wenn Sie mit Ihrer Implementierung zufrieden sind, führen Sie die folgende Checkliste aus, um zu bestätigen, dass Ihre Modellarchitektur +fertig ist: +1. Alle Schichten, die sich zur Trainingszeit anders verhalten (z.B. Dropout), werden mit einem `training`-Argument aufgerufen, das +von den Top-Level-Klassen weitergegeben wird +2. Sie haben `#copied from ...` verwendet, wann immer es möglich war. +3. 
Die Klasse `TFBrandNewBertMainLayer` und alle Klassen, die sie verwenden, haben ihre `call`-Funktion mit `@unpack_inputs` dekoriert +4. `TFBrandNewBertMainLayer` ist mit `@keras_serializable` dekoriert +5. Ein TensorFlow-Modell kann aus PyTorch-Gewichten mit `TFBrandNewBert.from_pretrained(model_repo, from_pt=True)` geladen werden. +6. Sie können das TensorFlow-Modell mit dem erwarteten Eingabeformat aufrufen + + +### 5. Modell-Tests hinzufügen + +Hurra, Sie haben ein TensorFlow-Modell implementiert! Jetzt ist es an der Zeit, Tests hinzuzufügen, um sicherzustellen, dass sich Ihr Modell +wie erwartet verhält. Wie im vorigen Abschnitt schlagen wir vor, zunächst die Datei `test_modeling_brand_new_bert.py` in +`tests/models/brand_new_bert/` in die Datei `test_modeling_tf_brand_new_bert.py` zu kopieren und dann die notwendigen +TensorFlow-Ersetzungen vorzunehmen. Für den Moment sollten Sie in allen Aufrufen von `.from_pretrained()` das Flag `from_pt=True` verwenden, um +die vorhandenen PyTorch-Gewichte zu laden. + +Wenn Sie damit fertig sind, kommt der Moment der Wahrheit: Führen Sie die Tests durch! 😬 + +```bash +NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \ +py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py +``` + +Das wahrscheinlichste Ergebnis ist, dass Sie eine Reihe von Fehlern sehen werden. Machen Sie sich keine Sorgen, das ist zu erwarten! Das Debuggen von ML-Modellen ist +notorisch schwierig, und der Schlüssel zum Erfolg ist Geduld (und `breakpoint()`). Nach unserer Erfahrung entstehen die schwierigsten +Probleme aus subtilen Unstimmigkeiten zwischen den ML-Frameworks, zu denen wir am Ende dieses Leitfadens ein paar Hinweise geben. +In anderen Fällen kann es sein, dass ein allgemeiner Test nicht direkt auf Ihr Modell anwendbar ist; in diesem Fall empfehlen wir eine Überschreibung +auf der Ebene der Modelltestklasse. Zögern Sie nicht, in Ihrem Entwurf einer Pull-Anfrage um Hilfe zu bitten, wenn +Sie nicht weiterkommen. + +Wenn alle Tests erfolgreich waren, können Sie Ihr Modell in die 🤗 Transformers-Bibliothek aufnehmen! 🎉 + +### 6.-7. Stellen Sie sicher, dass jeder Ihr Modell verwenden kann + +**6. Reichen Sie den Pull Request ein** + +Sobald Sie mit der Implementierung und den Tests fertig sind, ist es an der Zeit, eine Pull-Anfrage einzureichen. Bevor Sie Ihren Code einreichen, +führen Sie unser Dienstprogramm zur Codeformatierung, `make fixup` 🪄, aus. Damit werden automatisch alle Formatierungsfehler behoben, die dazu führen würden, dass +unsere automatischen Prüfungen fehlschlagen. + +Nun ist es an der Zeit, Ihren Entwurf einer Pull-Anfrage in eine echte Pull-Anfrage umzuwandeln. Klicken Sie dazu auf die Schaltfläche "Bereit für +Review" und fügen Sie Joao (`@gante`) und Matt (`@Rocketknight1`) als Reviewer hinzu. Eine Modell-Pull-Anfrage benötigt +mindestens 3 Reviewer, aber sie werden sich darum kümmern, geeignete zusätzliche Reviewer für Ihr Modell zu finden. + +Nachdem alle Gutachter mit dem Stand Ihres PR zufrieden sind, entfernen Sie als letzten Aktionspunkt das Flag `from_pt=True` in den +`.from_pretrained()`-Aufrufen. Da es noch keine TensorFlow-Gewichte gibt, müssen Sie sie hinzufügen! Lesen Sie den Abschnitt +unten, um zu erfahren, wie Sie dies tun können. 
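+Zur Verdeutlichung eine kleine Skizze dieses letzten Schritts (die Klasse `TFBrandNewBert` und das Repository `model_repo` sind die Platzhalter aus diesem Leitfaden):
+
+```py
+from transformers import TFBrandNewBert  # Platzhalter-Klasse aus diesem Leitfaden
+
+# Während der Entwicklung: TensorFlow-Modell direkt aus den PyTorch-Gewichten laden
+model = TFBrandNewBert.from_pretrained("model_repo", from_pt=True)
+
+# Sobald die TensorFlow-Gewichte auf dem Hub liegen: das Flag entfernen
+model = TFBrandNewBert.from_pretrained("model_repo")
+```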
+ +Wenn schließlich die TensorFlow-Gewichte zusammengeführt wurden, Sie mindestens 3 Genehmigungen von Prüfern haben und alle CI-Checks +grün sind, überprüfen Sie die Tests ein letztes Mal lokal + +```bash +NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \ +py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py +``` + +und wir werden Ihren PR zusammenführen! Herzlichen Glückwunsch zu dem Meilenstein 🎉. + +**7. (Optional) Erstellen Sie Demos und teilen Sie sie mit der Welt** + +Eine der schwierigsten Aufgaben bei Open-Source ist die Auffindbarkeit. Wie können die anderen Benutzer von der Existenz Ihres +fabelhaften TensorFlow-Beitrags erfahren? Mit der richtigen Kommunikation, natürlich! 📣 + +Es gibt vor allem zwei Möglichkeiten, Ihr Modell mit der Community zu teilen: +- Erstellen Sie Demos. Dazu gehören Gradio-Demos, Notebooks und andere unterhaltsame Möglichkeiten, Ihr Modell vorzuführen. Wir + ermutigen Sie, ein Notebook zu unseren [community-driven demos](https://huggingface.co/docs/transformers/community) hinzuzufügen. +- Teilen Sie Geschichten in sozialen Medien wie Twitter und LinkedIn. Sie sollten stolz auf Ihre Arbeit sein und + Ihre Leistung mit der Community teilen - Ihr Modell kann nun von Tausenden von Ingenieuren und Forschern auf der ganzen + Welt genutzt werden 🌍! Wir werden Ihre Beiträge gerne retweeten und Ihnen helfen, Ihre Arbeit mit der Community zu teilen. + + +## Hinzufügen von TensorFlow-Gewichten zum 🤗 Hub + +Unter der Annahme, dass die TensorFlow-Modellarchitektur in 🤗 Transformers verfügbar ist, ist die Umwandlung von PyTorch-Gewichten in +TensorFlow-Gewichte ein Kinderspiel! + +Hier sehen Sie, wie es geht: +1. Stellen Sie sicher, dass Sie in Ihrem Terminal bei Ihrem Hugging Face Konto angemeldet sind. Sie können sich mit dem Befehl + `huggingface-cli login` anmelden (Ihre Zugangstoken finden Sie [hier](https://huggingface.co/settings/tokens)) +2. Führen Sie `transformers-cli pt-to-tf --model-name foo/bar` aus, wobei `foo/bar` der Name des Modell-Repositorys + ist, das die PyTorch-Gewichte enthält, die Sie konvertieren möchten. +3. Markieren Sie `@joaogante` und `@Rocketknight1` in dem 🤗 Hub PR, den der obige Befehl gerade erstellt hat + +Das war's! 🎉 + + +## Fehlersuche in verschiedenen ML-Frameworks 🐛 + +Irgendwann, wenn Sie eine neue Architektur hinzufügen oder TensorFlow-Gewichte für eine bestehende Architektur erstellen, +stoßen Sie vielleicht auf Fehler, die sich über Unstimmigkeiten zwischen PyTorch und TensorFlow beschweren. Sie könnten sich sogar dazu entschließen, den +Modellarchitektur-Code für die beiden Frameworks zu öffnen, und feststellen, dass beide identisch aussehen. Was ist denn da los? 🤔 + +Lassen Sie uns zunächst darüber sprechen, warum es wichtig ist, diese Diskrepanzen zu verstehen. Viele Community-Mitglieder werden 🤗 +Transformers-Modelle direkt verwenden und vertrauen darauf, dass sich unsere Modelle wie erwartet verhalten. Wenn eine große Diskrepanz +zwischen den beiden Frameworks auftritt, bedeutet dies, dass das Modell für mindestens eines der Frameworks nicht der +Referenzimplementierung folgt. Dies kann zu stillen Fehlern führen, bei denen das Modell zwar läuft, aber eine schlechte Leistung aufweist. Dies ist +wohl schlimmer als ein Modell, das überhaupt nicht läuft! Aus diesem Grund streben wir an, dass die Abweichung zwischen den Frameworks in +allen Phasen des Modells kleiner als `1e-5` ist. 
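+Eine minimale Skizze, wie sich eine solche Abweichung messen lässt (hier mit dem bereits portierten Checkpoint `bert-base-uncased` als Beispiel; für Ihr Modell ersetzen Sie die Klassen entsprechend):
+
+```py
+import numpy as np
+from transformers import AutoTokenizer, BertModel, TFBertModel
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+pt_model = BertModel.from_pretrained("bert-base-uncased")
+tf_model = TFBertModel.from_pretrained("bert-base-uncased")
+
+text = "Hello, TensorFlow!"
+pt_out = pt_model(**tokenizer(text, return_tensors="pt")).last_hidden_state
+tf_out = tf_model(**tokenizer(text, return_tensors="tf")).last_hidden_state
+
+# Maximale absolute Abweichung zwischen den beiden Frameworks - Ziel: < 1e-5
+max_diff = np.max(np.abs(pt_out.detach().numpy() - tf_out.numpy()))
+print(f"Maximale Abweichung: {max_diff:.2e}")
+```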
+ +Wie bei anderen numerischen Problemen auch, steckt der Teufel im Detail. Und wie bei jedem detailorientierten Handwerk ist die geheime +Zutat hier Geduld. Hier ist unser Vorschlag für den Arbeitsablauf, wenn Sie auf diese Art von Problemen stoßen: +1. Lokalisieren Sie die Quelle der Abweichungen. Das Modell, das Sie konvertieren, hat wahrscheinlich bis zu einem gewissen Punkt nahezu identische innere Variablen. + Platzieren Sie `breakpoint()`-Anweisungen in den Architekturen der beiden Frameworks und vergleichen Sie die Werte der + numerischen Variablen von oben nach unten, bis Sie die Quelle der Probleme gefunden haben. +2. Nachdem Sie nun die Ursache des Problems gefunden haben, setzen Sie sich mit dem 🤗 Transformers-Team in Verbindung. Es ist möglich, + dass wir ein ähnliches Problem schon einmal gesehen haben und umgehend eine Lösung anbieten können. Als Ausweichmöglichkeit können Sie beliebte Seiten + wie StackOverflow und GitHub Issues durchsuchen. +3. Wenn keine Lösung in Sicht ist, bedeutet das, dass Sie tiefer gehen müssen. Die gute Nachricht ist, dass Sie die Ursache des + Problems ausfindig gemacht haben, so dass Sie sich auf die problematische Anweisung konzentrieren und den Rest des Modells ausblenden können! Die schlechte Nachricht ist, + dass Sie sich in die Quellimplementierung der besagten Anweisung einarbeiten müssen. In manchen Fällen finden Sie vielleicht ein + Problem mit einer Referenzimplementierung - zögern Sie dann nicht, ein Issue im Upstream-Repository zu öffnen. + +In einigen Fällen können wir nach Rücksprache mit dem 🤗 Transformers-Team zu dem Schluss kommen, dass die Behebung der Abweichung nicht machbar ist. +Wenn die Abweichung in den Ausgabeschichten des Modells sehr klein ist (aber möglicherweise groß in den versteckten Zuständen), +könnten wir beschließen, sie zu ignorieren und das Modell zu verteilen. Die oben erwähnte CLI `pt-to-tf` hat ein `--max-error`- +Flag, um die Fehlermeldung bei der Gewichtskonvertierung zu überschreiben. diff --git a/docs/source/de/autoclass_tutorial.md b/docs/source/de/autoclass_tutorial.md new file mode 100644 index 000000000000..7707f7b39b49 --- /dev/null +++ b/docs/source/de/autoclass_tutorial.md @@ -0,0 +1,131 @@ + + +# Vortrainierte Instanzen mit einer AutoClass laden + +Bei so vielen verschiedenen Transformator-Architekturen kann es eine Herausforderung sein, eine für Ihren Checkpoint zu erstellen. Als Teil der 🤗 Transformers Kernphilosophie, die Bibliothek leicht, einfach und flexibel nutzbar zu machen, leitet eine `AutoClass` automatisch die richtige Architektur aus einem gegebenen Checkpoint ab und lädt sie. Mit der Methode `from_pretrained()` kann man schnell ein vortrainiertes Modell für eine beliebige Architektur laden, so dass man keine Zeit und Ressourcen aufwenden muss, um ein Modell von Grund auf zu trainieren. Die Erstellung dieser Art von Checkpoint-agnostischem Code bedeutet, dass Ihr Code, wenn er für einen Checkpoint funktioniert, auch mit einem anderen Checkpoint funktionieren wird - solange er für eine ähnliche Aufgabe trainiert wurde - selbst wenn die Architektur unterschiedlich ist. + + + +Denken Sie daran, dass sich die Architektur auf das Skelett des Modells bezieht und die Checkpoints die Gewichte für eine bestimmte Architektur sind. Zum Beispiel ist [BERT](https://huggingface.co/bert-base-uncased) eine Architektur, während `bert-base-uncased` ein Checkpoint ist. Modell ist ein allgemeiner Begriff, der entweder Architektur oder Prüfpunkt bedeuten kann. 
+ + + +In dieser Anleitung lernen Sie, wie man: + +* Einen vortrainierten Tokenizer lädt. +* Einen vortrainierten Merkmalsextraktor lädt. +* Einen vortrainierten Prozessor lädt. +* Ein vortrainiertes Modell lädt. + +## AutoTokenizer + +Nahezu jede NLP-Aufgabe beginnt mit einem Tokenizer. Ein Tokenizer wandelt Ihre Eingabe in ein Format um, das vom Modell verarbeitet werden kann. + +Laden Sie einen Tokenizer mit [`AutoTokenizer.from_pretrained`]: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") +``` + +Dann tokenisieren Sie Ihre Eingabe wie unten gezeigt: + +```py +>>> sequence = "In a hole in the ground there lived a hobbit." +>>> print(tokenizer(sequence)) +{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +## AutoFeatureExtractor + +Für Audio- und Bildverarbeitungsaufgaben verarbeitet ein Merkmalsextraktor das Audiosignal oder Bild in das richtige Eingabeformat. + +Laden Sie einen Merkmalsextraktor mit [`AutoFeatureExtractor.from_pretrained`]: + +```py +>>> from transformers import AutoFeatureExtractor + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained( +... "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" +... ) +``` + +## AutoProcessor + +Multimodale Aufgaben erfordern einen Prozessor, der zwei Arten von Vorverarbeitungswerkzeugen kombiniert. Das Modell [LayoutLMV2](model_doc/layoutlmv2) beispielsweise benötigt einen Feature-Extraktor für Bilder und einen Tokenizer für Text; ein Prozessor kombiniert beide. + +Laden Sie einen Prozessor mit [`AutoProcessor.from_pretrained`]: + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased") +``` + +## AutoModel + + + +Mit den `AutoModelFor`-Klassen können Sie schließlich ein vortrainiertes Modell für eine bestimmte Aufgabe laden (siehe [hier](model_doc/auto) für eine vollständige Liste der verfügbaren Aufgaben). Laden Sie zum Beispiel ein Modell für die Sequenzklassifikation mit [`AutoModelForSequenceClassification.from_pretrained`]: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Sie können denselben Prüfpunkt problemlos wiederverwenden, um eine Architektur für eine andere Aufgabe zu laden: + +```py +>>> from transformers import AutoModelForTokenClassification + +>>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") +``` + + + +Für PyTorch-Modelle verwendet die Methode `from_pretrained()` `torch.load()`, die intern `pickle` verwendet und als unsicher bekannt ist. Generell sollte man niemals ein Modell laden, das aus einer nicht vertrauenswürdigen Quelle stammen könnte, oder das manipuliert worden sein könnte. Dieses Sicherheitsrisiko wird für öffentliche Modelle, die auf dem Hugging Face Hub gehostet werden, teilweise gemildert, da diese bei jeder Übertragung [auf Malware](https://huggingface.co/docs/hub/security-malware) gescannt werden. Siehe die [Hub-Dokumentation](https://huggingface.co/docs/hub/security) für Best Practices wie [signierte Commit-Verifizierung](https://huggingface.co/docs/hub/security-gpg#signing-commits-with-gpg) mit GPG. 
TensorFlow- und Flax-Checkpoints sind nicht betroffen und können in PyTorch-Architekturen mit den Kwargs `from_tf` und `from_flax` für die Methode `from_pretrained` geladen werden, um dieses Problem zu umgehen. + + + +Im Allgemeinen empfehlen wir die Verwendung der Klasse `AutoTokenizer` und der Klasse `AutoModelFor`, um vortrainierte Instanzen von Modellen zu laden. Dadurch wird sichergestellt, dass Sie jedes Mal die richtige Architektur laden. Im nächsten [Tutorial](preprocessing) erfahren Sie, wie Sie Ihren neu geladenen Tokenizer, Feature Extractor und Prozessor verwenden, um einen Datensatz für die Feinabstimmung vorzuverarbeiten. + + +Mit den Klassen `TFAutoModelFor` schließlich können Sie ein vortrainiertes Modell für eine bestimmte Aufgabe laden (siehe [hier](model_doc/auto) für eine vollständige Liste der verfügbaren Aufgaben). Laden Sie zum Beispiel ein Modell für die Sequenzklassifikation mit [`TFAutoModelForSequenceClassification.from_pretrained`]: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Sie können denselben Prüfpunkt problemlos wiederverwenden, um eine Architektur für eine andere Aufgabe zu laden: + +```py +>>> from transformers import TFAutoModelForTokenClassification + +>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") +``` + +Im Allgemeinen empfehlen wir, die Klasse `AutoTokenizer` und die Klasse `TFAutoModelFor` zu verwenden, um vortrainierte Instanzen von Modellen zu laden. Dadurch wird sichergestellt, dass Sie jedes Mal die richtige Architektur laden. Im nächsten [Tutorial](preprocessing) erfahren Sie, wie Sie Ihren neu geladenen Tokenizer, Feature Extractor und Prozessor verwenden, um einen Datensatz für die Feinabstimmung vorzuverarbeiten. + + diff --git a/docs/source/de/autoclass_tutorial.mdx b/docs/source/de/autoclass_tutorial.mdx deleted file mode 100644 index 95247cd04ba0..000000000000 --- a/docs/source/de/autoclass_tutorial.mdx +++ /dev/null @@ -1,127 +0,0 @@ - - -# Vortrainierte Instanzen mit einer AutoClass laden - -Bei so vielen verschiedenen Transformator-Architekturen kann es eine Herausforderung sein, eine für Ihren Checkpoint zu erstellen. Als Teil der 🤗 Transformers Kernphilosophie, die Bibliothek leicht, einfach und flexibel nutzbar zu machen, leitet eine `AutoClass` automatisch die richtige Architektur aus einem gegebenen Checkpoint ab und lädt sie. Mit der Methode `from_pretrained()` kann man schnell ein vortrainiertes Modell für eine beliebige Architektur laden, so dass man keine Zeit und Ressourcen aufwenden muss, um ein Modell von Grund auf zu trainieren. Die Erstellung dieser Art von Checkpoint-agnostischem Code bedeutet, dass Ihr Code, wenn er für einen Checkpoint funktioniert, auch mit einem anderen Checkpoint funktionieren wird - solange er für eine ähnliche Aufgabe trainiert wurde - selbst wenn die Architektur unterschiedlich ist. - - - -Denken Sie daran, dass sich die Architektur auf das Skelett des Modells bezieht und die Checkpoints die Gewichte für eine bestimmte Architektur sind. Zum Beispiel ist [BERT](https://huggingface.co/bert-base-uncased) eine Architektur, während `bert-base-uncased` ein Checkpoint ist. Modell ist ein allgemeiner Begriff, der entweder Architektur oder Prüfpunkt bedeuten kann. - - - -In dieser Anleitung lernen Sie, wie man: - -* Einen vortrainierten Tokenizer lädt. -* Einen vortrainierten Merkmalsextraktor lädt. 
-* Einen vortrainierten Prozessor lädt. -* Ein vortrainiertes Modell lädt. - -## AutoTokenizer - -Nahezu jede NLP-Aufgabe beginnt mit einem Tokenizer. Ein Tokenizer wandelt Ihre Eingabe in ein Format um, das vom Modell verarbeitet werden kann. - -Laden Sie einen Tokenizer mit [`AutoTokenizer.from_pretrained`]: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") -``` - -Dann tokenisieren Sie Ihre Eingabe wie unten gezeigt: - -```py ->>> sequence = "In a hole in the ground there lived a hobbit." ->>> print(tokenizer(sequence)) -{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -## AutoFeatureExtractor - -Für Audio- und Bildverarbeitungsaufgaben verarbeitet ein Merkmalsextraktor das Audiosignal oder Bild in das richtige Eingabeformat. - -Laden Sie einen Merkmalsextraktor mit [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained( -... "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" -... ) -``` - -## AutoProcessor - -Multimodale Aufgaben erfordern einen Prozessor, der zwei Arten von Vorverarbeitungswerkzeugen kombiniert. Das Modell [LayoutLMV2](model_doc/layoutlmv2) beispielsweise benötigt einen Feature-Extraktor für Bilder und einen Tokenizer für Text; ein Prozessor kombiniert beide. - -Laden Sie einen Prozessor mit [`AutoProcessor.from_pretrained`]: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased") -``` - -## AutoModel - - - -Mit den `AutoModelFor`-Klassen können Sie schließlich ein vortrainiertes Modell für eine bestimmte Aufgabe laden (siehe [hier](model_doc/auto) für eine vollständige Liste der verfügbaren Aufgaben). Laden Sie zum Beispiel ein Modell für die Sequenzklassifikation mit [`AutoModelForSequenceClassification.from_pretrained`]: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Sie können denselben Prüfpunkt problemlos wiederverwenden, um eine Architektur für eine andere Aufgabe zu laden: - -```py ->>> from transformers import AutoModelForTokenClassification - ->>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") -``` - - - -Für PyTorch-Modelle verwendet die Methode `from_pretrained()` `torch.load()`, die intern `pickle` verwendet und als unsicher bekannt ist. Generell sollte man niemals ein Modell laden, das aus einer nicht vertrauenswürdigen Quelle stammen könnte, oder das manipuliert worden sein könnte. Dieses Sicherheitsrisiko wird für öffentliche Modelle, die auf dem Hugging Face Hub gehostet werden, teilweise gemildert, da diese bei jeder Übertragung [auf Malware](https://huggingface.co/docs/hub/security-malware) gescannt werden. Siehe die [Hub-Dokumentation](https://huggingface.co/docs/hub/security) für Best Practices wie [signierte Commit-Verifizierung](https://huggingface.co/docs/hub/security-gpg#signing-commits-with-gpg) mit GPG. - -TensorFlow- und Flax-Checkpoints sind nicht betroffen und können in PyTorch-Architekturen mit den Kwargs `from_tf` und `from_flax` für die Methode `from_pretrained` geladen werden, um dieses Problem zu umgehen. 
- - - -Im Allgemeinen empfehlen wir die Verwendung der Klasse "AutoTokenizer" und der Klasse "AutoModelFor", um trainierte Instanzen von Modellen zu laden. Dadurch wird sichergestellt, dass Sie jedes Mal die richtige Architektur laden. Im nächsten [Tutorial] (Vorverarbeitung) erfahren Sie, wie Sie Ihren neu geladenen Tokenizer, Feature Extractor und Prozessor verwenden, um einen Datensatz für die Feinabstimmung vorzuverarbeiten. - - -Mit den Klassen `TFAutoModelFor` schließlich können Sie ein vortrainiertes Modell für eine bestimmte Aufgabe laden (siehe [hier](model_doc/auto) für eine vollständige Liste der verfügbaren Aufgaben). Laden Sie zum Beispiel ein Modell für die Sequenzklassifikation mit [`TFAutoModelForSequenceClassification.from_pretrained`]: - -```py ->>> from transformers import TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Sie können denselben Prüfpunkt problemlos wiederverwenden, um eine Architektur für eine andere Aufgabe zu laden: - -```py ->>> from transformers import TFAutoModelForTokenClassification - ->>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") -``` - -Im Allgemeinen empfehlen wir, die Klasse "AutoTokenizer" und die Klasse "TFAutoModelFor" zu verwenden, um vortrainierte Instanzen von Modellen zu laden. Dadurch wird sichergestellt, dass Sie jedes Mal die richtige Architektur laden. Im nächsten [Tutorial] (Vorverarbeitung) erfahren Sie, wie Sie Ihren neu geladenen Tokenizer, Feature Extractor und Prozessor verwenden, um einen Datensatz für die Feinabstimmung vorzuverarbeiten. - - diff --git a/docs/source/de/index.md b/docs/source/de/index.md new file mode 100644 index 000000000000..4742a99f643c --- /dev/null +++ b/docs/source/de/index.md @@ -0,0 +1,334 @@ + + +# 🤗 Transformers + +Maschinelles Lernen auf dem neuesten Stand der Technik für PyTorch, TensorFlow und JAX. + +🤗 Transformers bietet APIs zum einfachen Herunterladen und Trainieren von vortrainierten Modellen auf dem neuesten Stand der Technik. Die Verwendung von vortrainierten Modellen kann Rechenkosten sparen und den CO2-Fußabdruck reduzieren und Zeit sparen, die für das Training eines Modells von Grund auf benötigt wird. Die Modelle können für verschiedene Modalitäten verwendet werden, wie z. B.: + +* 📝 Text: Textklassifizierung, Informationsextrahierung, Beantwortung von Fragen, Zusammenfassung, Übersetzung und Texterstellung in über 100 Sprachen. +* 🖼️ Bilder: Bildklassifizierung, Objekterkennung und Segmentierung. +* 🗣️ Audio: Spracherkennung und Audioklassifizierung. +* 🐙 Multimodal: Beantwortung von Tabellenfragen, optische Zeichenerkennung, Informationsextraktion aus gescannten Dokumenten, Videoklassifizierung und Beantwortung visueller Fragen. + +Unsere Bibliothek unterstützt die nahtlose Integration von drei der beliebtesten Deep-Learning-Bibliotheken: [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/) und [JAX](https://jax.readthedocs.io/en/latest/). Trainieren Sie Ihr Modell in drei Codezeilen in einem Framework und laden Sie es zur Inferenz mit einem anderen. + +Jede 🤗 Transformers-Architektur ist in einem eigenständigen Python-Modul definiert, so dass sie leicht für Forschung und Experimente angepasst werden kann. 
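+Ein kurzes Beispiel dafür, wie sich ein in einem Framework trainiertes Modell in einem anderen zur Inferenz laden lässt (eine Skizze; der Checkpoint `distilbert-base-uncased` und der lokale Pfad `./mein-modell` dienen nur als Beispiel):
+
+```py
+from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification
+
+# Ein in PyTorch (vor-)trainiertes Modell laden und lokal speichern ...
+pt_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+pt_model.save_pretrained("./mein-modell")
+
+# ... und dieselben Gewichte zur Inferenz in TensorFlow laden
+tf_model = TFAutoModelForSequenceClassification.from_pretrained("./mein-modell", from_pt=True)
+```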
+ +## Wenn Sie auf der Suche nach individueller Unterstützung durch das Hugging Face-Team sind + + HuggingFace Expert Acceleration Program + +## Inhalt + +Die Dokumentation ist in fünf Teile gegliedert: + +- **GET STARTED** enthält eine kurze Tour und Installationsanweisungen, um mit 🤗 Transformers loszulegen. +- **TUTORIALS** sind ein hervorragender Ausgangspunkt, wenn Sie neu in unserer Bibliothek sind. Dieser Abschnitt hilft Ihnen, die grundlegenden Fähigkeiten zu erlangen, die Sie benötigen, um mit 🤗 Transformers zu arbeiten. +- **HOW-TO GUIDES** zeigen Ihnen, wie Sie ein bestimmtes Ziel erreichen können, z. B. die Feinabstimmung eines vortrainierten Modells für die Sprachmodellierung oder die Erstellung eines benutzerdefinierten Modellkopfs. +- **KONZEPTUELLE ANLEITUNGEN** bieten weitere Diskussionen und Erklärungen zu den zugrunde liegenden Konzepten und Ideen hinter Modellen, Aufgaben und der Designphilosophie von 🤗 Transformers. +- **API** beschreibt jede Klasse und Funktion, gruppiert in: + + - **MAIN CLASSES** für die Hauptklassen, die die wichtigsten APIs der Bibliothek darstellen. + - **MODELLE** für die Klassen und Funktionen, die zu jedem in der Bibliothek implementierten Modell gehören. + - **INTERNAL HELPERS** für die Klassen und Funktionen, die wir intern verwenden. + +Die Bibliothek enthält derzeit JAX-, PyTorch- und TensorFlow-Implementierungen, vortrainierte Modellgewichte, Nutzungsskripte und Konvertierungsprogramme für die folgenden Modelle. + +### Unterstützte Modelle + + + +1. **[ALBERT](model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. +1. **[ALIGN](model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig. +1. **[BART](model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. +1. **[BARThez](model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. +1. **[BARTpho](model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. +1. **[BEiT](model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei. +1. **[BERT](model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. +1. 
**[BERT For Sequence Generation](model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[BERTweet](model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. +1. **[BigBird-Pegasus](model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[BigBird-RoBERTa](model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[Blenderbot](model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. +1. **[BlenderbotSmall](model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. +1. **[BLOOM](model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). +1. **[BORT](model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry. +1. **[ByT5](model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. +1. **[CamemBERT](model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. +1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. +1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. +1. 
**[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. +1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. +1. **[ConvNeXTV2](model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. +1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. +1. **[CTRL](model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. +1. **[CvT](model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. +1. **[Data2Vec](model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. +1. **[DeBERTa](model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. +1. **[DeBERTa-v2](model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. +1. **[Decision Transformer](model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. +1. **[DeiT](model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. +1. 
**[DETR](model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. +1. **[DialoGPT](model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. +1. **[DistilBERT](model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. +1. **[DiT](model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. +1. **[DPR](model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. +1. **[DPT](master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. +1. **[EfficientNet](model_doc/efficientnet)** (from Google Research) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan and Quoc V. Le. +1. **[ELECTRA](model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. +1. **[EncoderDecoder](model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[FlauBERT](model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. +1. **[FLAVA](model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. +1. 
**[FNet](model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. +1. **[Funnel Transformer](model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. +1. **[GLPN](model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. +1. **[GPT](model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. +1. **[GPT Neo](model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. +1. **[GPT NeoX](model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach +1. **[GPT-2](model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. +1. **[GPT-J](model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. +1. **[GPTSAN-japanese](model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). +1. **[GroupViT](model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. +1. **[Hubert](model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. +1. **[I-BERT](model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. +1. **[ImageGPT](model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. +1. 
**[LayoutLM](model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. +1. **[LayoutLMv2](model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. +1. **[LayoutLMv3](model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei. +1. **[LayoutXLM](model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. +1. **[LED](model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. +1. **[LeViT](model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze. +1. **[Longformer](model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. +1. **[LongT5](model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang. +1. **[LUKE](model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. +1. **[LXMERT](model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal. +1. **[M-CTC-T](model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert. +1. **[M2M100](model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. +1. **[MarianMT](model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. 
The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. +1. **[Mask2Former](model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. +1. **[MaskFormer](model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. +1. **[mBART](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. +1. **[mBART-50](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. +1. **[Megatron-BERT](model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +1. **[Megatron-GPT2](model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +1. **[mLUKE](model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka. +1. **[MobileBERT](model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou. +1. **[MobileViT](model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari. +1. **[MPNet](model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. +1. **[MT5](model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. +1. **[MVP](model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. +1. 
**[Nezha](model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. +1. **[NLLB](model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. **[Nyströmformer](model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. +1. **[OneFormer](model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. +1. **[OPT](master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. +1. **[OWL-ViT](model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. +1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. +1. **[Perceiver IO](model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. +1. **[PhoBERT](model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. +1. **[PLBart](model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. +1. **[PoolFormer](model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. +1. **[ProphetNet](model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. +1. 
**[QDQBert](model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. +1. **[RAG](model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. +1. **[REALM](model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. +1. **[Reformer](model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. +1. **[RegNet](model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. +1. **[RemBERT](model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. +1. **[ResNet](model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. +1. **[RoBERTa](model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. +1. **[RoFormer](model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. +1. **[SegFormer](model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. +1. **[SEW](model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. **[SEW-D](model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. **[SpeechToTextTransformer](model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. +1. 
**[SpeechToTextTransformer2](model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. +1. **[Splinter](model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. +1. **[SqueezeBERT](model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. +1. **[Swin Transformer](model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. +1. **[Swin Transformer V2](model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. +1. **[T5](model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. +1. **[T5v1.1](model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. +1. **[TAPAS](model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. +1. **[TAPEX](model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[Trajectory Transformer](model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine +1. **[Transformer-XL](model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. +1. 
**[TrOCR](model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. +1. **[UL2](model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler +1. **[UMT5](model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant. +1. **[UniSpeech](model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. +1. **[UniSpeechSat](model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. +1. **[VAN](model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. +1. **[VideoMAE](model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. +1. **[ViLT](model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. +1. **[Vision Transformer (ViT)](model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[VisualBERT](model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. +1. **[ViTMAE](model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. +1. **[Wav2Vec2](model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. +1. 
**[Wav2Vec2-Conformer](model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. +1. **[Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. +1. **[WavLM](model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. +1. **[XGLM](model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. +1. **[XLM](model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. +1. **[XLM-ProphetNet](model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. +1. **[XLM-RoBERTa](model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. +1. **[XLM-RoBERTa-XL](model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. +1. **[XLM-V](model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa. +1. **[XLNet](model_doc/xlnet)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. +1. **[XLS-R](model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. +1. 
**[XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. +1. **[YOLOS](model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu. +1. **[YOSO](model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. + + +### Unterstützte Frameworks + +Die folgende Tabelle zeigt die derzeitige Unterstützung in der Bibliothek für jedes dieser Modelle, unabhängig davon, ob sie einen Python +Tokenizer haben (als "langsam" bezeichnet), ein "schneller" Tokenizer, der von der 🤗 Tokenizers Bibliothek unterstützt wird, ob sie Unterstützung in Jax (via +Flax), PyTorch, und/oder TensorFlow haben. + + + +| Model | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support | +|:---------------------------:|:--------------:|:--------------:|:---------------:|:------------------:|:------------:| +| ALBERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| BART | ✅ | ✅ | ✅ | ✅ | ✅ | +| BEiT | ❌ | ❌ | ✅ | ❌ | ✅ | +| BERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ | +| BigBird | ✅ | ✅ | ✅ | ❌ | ✅ | +| BigBird-Pegasus | ❌ | ❌ | ✅ | ❌ | ❌ | +| Blenderbot | ✅ | ✅ | ✅ | ✅ | ✅ | +| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ | +| BLOOM | ❌ | ✅ | ✅ | ❌ | ✅ | +| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | +| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | +| CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | +| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| ConvNeXT | ❌ | ❌ | ✅ | ✅ | ❌ | +| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | +| CvT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecVision | ❌ | ❌ | ✅ | ✅ | ❌ | +| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | +| DeBERTa-v2 | ✅ | ✅ | ✅ | ✅ | ❌ | +| Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| DeiT | ❌ | ❌ | ✅ | ✅ | ❌ | +| DETR | ❌ | ❌ | ✅ | ❌ | ❌ | +| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| DPR | ✅ | ✅ | ✅ | ✅ | ❌ | +| DPT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | +| Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | +| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ | +| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ | +| FLAVA | ❌ | ❌ | ✅ | ❌ | ❌ | +| FNet | ✅ | ✅ | ✅ | ❌ | ❌ | +| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ | +| GLPN | ❌ | ❌ | ✅ | ❌ | ❌ | +| GPT Neo | ❌ | ❌ | ✅ | ❌ | ✅ | +| GPT NeoX | ❌ | ✅ | ✅ | ❌ | ❌ | +| GPT-J | ❌ | ❌ | ✅ | ✅ | ✅ | +| GroupViT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ | +| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ | +| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ | +| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ | +| LayoutLMv3 | ✅ | ✅ | ✅ | ❌ | ❌ | +| LED | ✅ | ✅ | ✅ | ✅ | ❌ | +| LeViT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ | +| LongT5 | ❌ | ❌ | ✅ | ❌ | ✅ | +| LUKE | ✅ | ❌ | ✅ | ❌ | ❌ | +| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| M-CTC-T | ❌ | ❌ | ✅ | ❌ | ❌ | +| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ | +| Marian | ✅ | ❌ | ✅ | ✅ | ✅ | +| MaskFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| mBART | ✅ | ✅ | ✅ | ✅ | ✅ | +| Megatron-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | +| 
MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| MobileViT | ❌ | ❌ | ✅ | ❌ | ❌ | +| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ | +| MT5 | ✅ | ✅ | ✅ | ✅ | ✅ | +| MVP | ✅ | ✅ | ✅ | ❌ | ❌ | +| Nezha | ❌ | ❌ | ✅ | ❌ | ❌ | +| Nyströmformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ | +| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | +| OPT | ❌ | ❌ | ✅ | ✅ | ✅ | +| OWL-ViT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ | +| Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ | +| PLBart | ✅ | ❌ | ✅ | ❌ | ❌ | +| PoolFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | +| QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ | +| RAG | ✅ | ❌ | ✅ | ✅ | ❌ | +| REALM | ✅ | ✅ | ✅ | ❌ | ❌ | +| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ | +| RegNet | ❌ | ❌ | ✅ | ✅ | ✅ | +| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| ResNet | ❌ | ❌ | ✅ | ✅ | ✅ | +| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | +| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | +| RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ | +| SegFormer | ❌ | ❌ | ✅ | ✅ | ❌ | +| SEW | ❌ | ❌ | ✅ | ❌ | ❌ | +| SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ | +| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ | +| Speech2Text | ✅ | ❌ | ✅ | ✅ | ❌ | +| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ | +| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ | +| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ | +| Swin Transformer | ❌ | ❌ | ✅ | ✅ | ❌ | +| Swin Transformer V2 | ❌ | ❌ | ✅ | ❌ | ❌ | +| T5 | ✅ | ✅ | ✅ | ✅ | ✅ | +| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ | +| Trajectory Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | +| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ | +| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ | +| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ | +| VAN | ❌ | ❌ | ✅ | ❌ | ❌ | +| VideoMAE | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViLT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | +| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ | +| VisualBERT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViT | ❌ | ❌ | ✅ | ✅ | ✅ | +| ViTMAE | ❌ | ❌ | ✅ | ✅ | ❌ | +| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | +| Wav2Vec2-Conformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| WavLM | ❌ | ❌ | ✅ | ❌ | ❌ | +| XGLM | ✅ | ✅ | ✅ | ❌ | ✅ | +| XLM | ✅ | ❌ | ✅ | ✅ | ❌ | +| XLM-ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | +| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | +| XLM-RoBERTa-XL | ❌ | ❌ | ✅ | ❌ | ❌ | +| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ | +| YOLOS | ❌ | ❌ | ✅ | ❌ | ❌ | +| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ | + + diff --git a/docs/source/de/index.mdx b/docs/source/de/index.mdx deleted file mode 100644 index c7d6511053ec..000000000000 --- a/docs/source/de/index.mdx +++ /dev/null @@ -1,322 +0,0 @@ - - -# 🤗 Transformers - -Maschinelles Lernen auf dem neuesten Stand der Technik für PyTorch, TensorFlow und JAX. - -🤗 Transformers bietet APIs zum einfachen Herunterladen und Trainieren von vortrainierten Modellen auf dem neuesten Stand der Technik. Die Verwendung von vortrainierten Modellen kann Rechenkosten sparen und den CO2-Fußabdruck reduzieren und Zeit sparen, die für das Training eines Modells von Grund auf benötigt wird. Die Modelle können für verschiedene Modalitäten verwendet werden, wie z. B.: - -* 📝 Text: Textklassifizierung, Informationsextrahierung, Beantwortung von Fragen, Zusammenfassung, Übersetzung und Texterstellung in über 100 Sprachen. -* 🖼️ Bilder: Bildklassifizierung, Objekterkennung und Segmentierung. -* 🗣️ Audio: Spracherkennung und Audioklassifizierung. -* 🐙 Multimodal: Beantwortung von Tabellenfragen, optische Zeichenerkennung, Informationsextraktion aus gescannten Dokumenten, Videoklassifizierung und Beantwortung visueller Fragen. - -Unsere Bibliothek unterstützt die nahtlose Integration von drei der beliebtesten Deep-Learning-Bibliotheken: [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/) und [JAX](https://jax.readthedocs.io/en/latest/). 
Trainieren Sie Ihr Modell in drei Codezeilen in einem Framework und laden Sie es zur Inferenz mit einem anderen. - -Jede 🤗 Transformers-Architektur ist in einem eigenständigen Python-Modul definiert, so dass sie leicht für Forschung und Experimente angepasst werden kann. - -## Wenn Sie auf der Suche nach individueller Unterstützung durch das Hugging Face-Team sind - - - HuggingFace Expert Acceleration Program - - -## Inhalt - -Die Dokumentation ist in fünf Teile gegliedert: - -- **GET STARTED** enthält eine kurze Tour und Installationsanweisungen, um mit 🤗 Transformers loszulegen. -- **TUTORIALS** sind ein hervorragender Ausgangspunkt, wenn Sie neu in unserer Bibliothek sind. Dieser Abschnitt hilft Ihnen, die grundlegenden Fähigkeiten zu erlangen, die Sie benötigen, um mit 🤗 Transformers zu arbeiten. -- **HOW-TO GUIDES** zeigen Ihnen, wie Sie ein bestimmtes Ziel erreichen können, z. B. die Feinabstimmung eines vortrainierten Modells für die Sprachmodellierung oder die Erstellung eines benutzerdefinierten Modellkopfs. -- **KONZEPTUELLE ANLEITUNGEN** bietet weitere Diskussionen und Erklärungen zu den zugrunde liegenden Konzepten und Ideen hinter Modellen, Aufgaben und der Designphilosophie von 🤗 Transformers. -- **API** beschreibt jede Klasse und Funktion, gruppiert in: - - - **MAIN CLASSES** für die Hauptklassen, die die wichtigsten APIs der Bibliothek darstellen. - - MODELLE** für die Klassen und Funktionen, die zu jedem in der Bibliothek implementierten Modell gehören. - - **INTERNAL HELPERS** für die Klassen und Funktionen, die wir intern verwenden. - -Die Bibliothek enthält derzeit JAX-, PyTorch- und TensorFlow-Implementierungen, vortrainierte Modellgewichte, Nutzungsskripte und Konvertierungsprogramme für die folgenden Modelle. - -### Unterstütze Modelle - - - -1. **[ALBERT](model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[BART](model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. -1. **[BARThez](model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. -1. **[BARTpho](model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. -1. **[BEiT](model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei. -1. **[BERT](model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. -1. 
**[BERT For Sequence Generation](model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. -1. **[BERTweet](model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. -1. **[BigBird-Pegasus](model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BigBird-RoBERTa](model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[Blenderbot](model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. **[BlenderbotSmall](model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. **[BLOOM](model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). -1. **[BORT](model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry. -1. **[ByT5](model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. -1. **[CamemBERT](model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. -1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. -1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. -1. 
**[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. -1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. -1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. -1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. -1. **[CTRL](model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. -1. **[CvT](model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. -1. **[Data2Vec](model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. -1. **[DeBERTa](model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. -1. **[DeBERTa-v2](model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. -1. **[Decision Transformer](model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. -1. **[DeiT](model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. -1. **[DETR](model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. -1. 
**[DialoGPT](model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. -1. **[DistilBERT](model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. -1. **[DiT](model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. -1. **[DPR](model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. -1. **[DPT](master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. -1. **[ELECTRA](model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. -1. **[EncoderDecoder](model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. -1. **[FlauBERT](model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. -1. **[FLAVA](model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. -1. **[FNet](model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. -1. **[Funnel Transformer](model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -1. 
**[GLPN](model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. -1. **[GPT](model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. -1. **[GPT Neo](model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. -1. **[GPT NeoX](model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach -1. **[GPT-2](model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. -1. **[GPT-J](model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. -1. **[GroupViT](model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. -1. **[Hubert](model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. -1. **[I-BERT](model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. -1. **[ImageGPT](model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. -1. **[LayoutLM](model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. -1. **[LayoutLMv2](model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. -1. 
**[LayoutLMv3](model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei. -1. **[LayoutXLM](model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. -1. **[LED](model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. -1. **[LeViT](model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze. -1. **[Longformer](model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. -1. **[LongT5](model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang. -1. **[LUKE](model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. -1. **[LXMERT](model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal. -1. **[M-CTC-T](model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert. -1. **[M2M100](model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. -1. **[MarianMT](model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. -1. **[MaskFormer](model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. -1. **[mBART](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. -1. 
**[mBART-50](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. -1. **[Megatron-BERT](model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. -1. **[Megatron-GPT2](model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. -1. **[mLUKE](model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka. -1. **[MobileBERT](model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou. -1. **[MobileViT](model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari. -1. **[MPNet](model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. -1. **[MT5](model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. -1. **[MVP](model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. -1. **[Nezha](model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. -1. **[NLLB](model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. -1. **[Nyströmformer](model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. -1. **[OPT](master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 
-1. **[OWL-ViT](model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. -1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. -1. **[Perceiver IO](model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. -1. **[PhoBERT](model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. -1. **[PLBart](model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. -1. **[PoolFormer](model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. -1. **[ProphetNet](model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -1. **[QDQBert](model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. -1. **[RAG](model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. -1. **[REALM](model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. -1. **[Reformer](model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. -1. **[RegNet](model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. -1. 
**[RemBERT](model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. -1. **[ResNet](model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. -1. **[RoBERTa](model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -1. **[RoFormer](model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. -1. **[SegFormer](model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. -1. **[SEW](model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. -1. **[SEW-D](model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. -1. **[SpeechToTextTransformer](model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. -1. **[SpeechToTextTransformer2](model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. -1. **[Splinter](model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. -1. **[SqueezeBERT](model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. -1. **[Swin Transformer](model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. -1. 
**[Swin Transformer V2](model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. -1. **[T5](model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -1. **[T5v1.1](model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -1. **[TAPAS](model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. -1. **[TAPEX](model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. -1. **[Trajectory Transformer](model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine -1. **[Transformer-XL](model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. -1. **[TrOCR](model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. -1. **[UL2](model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler -1. **[UniSpeech](model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. -1. **[UniSpeechSat](model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. -1. 
**[VAN](model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. -1. **[VideoMAE](model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. -1. **[ViLT](model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. -1. **[Vision Transformer (ViT)](model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. -1. **[VisualBERT](model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -1. **[ViTMAE](model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. -1. **[Wav2Vec2](model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. -1. **[Wav2Vec2-Conformer](model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. -1. **[Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. -1. **[WavLM](model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. -1. **[XGLM](model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. -1. 
**[XLM](model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. -1. **[XLM-ProphetNet](model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -1. **[XLM-RoBERTa](model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. -1. **[XLM-RoBERTa-XL](model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. -1. **[XLNet](model_doc/xlnet)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. -1. **[XLS-R](model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. -1. **[XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. -1. **[YOLOS](model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu. -1. **[YOSO](model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. - - -### Unterstützte Frameworks - -Die folgende Tabelle zeigt die derzeitige Unterstützung in der Bibliothek für jedes dieser Modelle, unabhängig davon, ob sie einen Python -Tokenizer haben (als "langsam" bezeichnet), ein "schneller" Tokenizer, der von der 🤗 Tokenizers Bibliothek unterstützt wird, ob sie Unterstützung in Jax (via -Flax), PyTorch, und/oder TensorFlow haben. 
- - - -| Model | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support | -|:---------------------------:|:--------------:|:--------------:|:---------------:|:------------------:|:------------:| -| ALBERT | ✅ | ✅ | ✅ | ✅ | ✅ | -| BART | ✅ | ✅ | ✅ | ✅ | ✅ | -| BEiT | ❌ | ❌ | ✅ | ❌ | ✅ | -| BERT | ✅ | ✅ | ✅ | ✅ | ✅ | -| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ | -| BigBird | ✅ | ✅ | ✅ | ❌ | ✅ | -| BigBird-Pegasus | ❌ | ❌ | ✅ | ❌ | ❌ | -| Blenderbot | ✅ | ✅ | ✅ | ✅ | ✅ | -| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ | -| BLOOM | ❌ | ✅ | ✅ | ❌ | ❌ | -| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | -| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | -| CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | -| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| ConvNeXT | ❌ | ❌ | ✅ | ✅ | ❌ | -| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | -| CvT | ❌ | ❌ | ✅ | ❌ | ❌ | -| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ | -| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ | -| Data2VecVision | ❌ | ❌ | ✅ | ✅ | ❌ | -| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | -| DeBERTa-v2 | ✅ | ✅ | ✅ | ✅ | ❌ | -| Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | -| DeiT | ❌ | ❌ | ✅ | ✅ | ❌ | -| DETR | ❌ | ❌ | ✅ | ❌ | ❌ | -| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | -| DPR | ✅ | ✅ | ✅ | ✅ | ❌ | -| DPT | ❌ | ❌ | ✅ | ❌ | ❌ | -| ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | -| Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | -| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ | -| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ | -| FLAVA | ❌ | ❌ | ✅ | ❌ | ❌ | -| FNet | ✅ | ✅ | ✅ | ❌ | ❌ | -| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ | -| GLPN | ❌ | ❌ | ✅ | ❌ | ❌ | -| GPT Neo | ❌ | ❌ | ✅ | ❌ | ✅ | -| GPT NeoX | ❌ | ✅ | ✅ | ❌ | ❌ | -| GPT-J | ❌ | ❌ | ✅ | ✅ | ✅ | -| GroupViT | ❌ | ❌ | ✅ | ❌ | ❌ | -| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ | -| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | -| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ | -| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ | -| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ | -| LayoutLMv3 | ✅ | ✅ | ✅ | ❌ | ❌ | -| LED | ✅ | ✅ | ✅ | ✅ | ❌ | -| LeViT | ❌ | ❌ | ✅ | ❌ | ❌ | -| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ | -| LongT5 | ❌ | ❌ | ✅ | ❌ | ✅ | -| LUKE | ✅ | ❌ | ✅ | ❌ | ❌ | -| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| M-CTC-T | ❌ | ❌ | ✅ | ❌ | ❌ | -| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ | -| Marian | ✅ | ❌ | ✅ | ✅ | ✅ | -| MaskFormer | ❌ | ❌ | ✅ | ❌ | ❌ | -| mBART | ✅ | ✅ | ✅ | ✅ | ✅ | -| Megatron-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | -| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| MobileViT | ❌ | ❌ | ✅ | ❌ | ❌ | -| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ | -| MT5 | ✅ | ✅ | ✅ | ✅ | ✅ | -| MVP | ✅ | ✅ | ✅ | ❌ | ❌ | -| Nezha | ❌ | ❌ | ✅ | ❌ | ❌ | -| Nyströmformer | ❌ | ❌ | ✅ | ❌ | ❌ | -| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ | -| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | -| OPT | ❌ | ❌ | ✅ | ✅ | ✅ | -| OWL-ViT | ❌ | ❌ | ✅ | ❌ | ❌ | -| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ | -| Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ | -| PLBart | ✅ | ❌ | ✅ | ❌ | ❌ | -| PoolFormer | ❌ | ❌ | ✅ | ❌ | ❌ | -| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | -| QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ | -| RAG | ✅ | ❌ | ✅ | ✅ | ❌ | -| REALM | ✅ | ✅ | ✅ | ❌ | ❌ | -| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ | -| RegNet | ❌ | ❌ | ✅ | ✅ | ❌ | -| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| ResNet | ❌ | ❌ | ✅ | ✅ | ❌ | -| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | -| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | -| RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ | -| SegFormer | ❌ | ❌ | ✅ | ✅ | ❌ | -| SEW | ❌ | ❌ | ✅ | ❌ | ❌ | -| SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ | -| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ | -| Speech2Text | ✅ | ❌ | ✅ | ✅ | ❌ | -| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ | -| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ | -| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ | -| Swin Transformer | ❌ | ❌ | ✅ | ✅ | ❌ | -| Swin Transformer V2 | ❌ | ❌ | ✅ | ❌ | ❌ | -| T5 | ✅ | ✅ | ✅ | ✅ | ✅ | -| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ | -| Trajectory 
Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | -| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | -| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ | -| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ | -| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ | -| VAN | ❌ | ❌ | ✅ | ❌ | ❌ | -| VideoMAE | ❌ | ❌ | ✅ | ❌ | ❌ | -| ViLT | ❌ | ❌ | ✅ | ❌ | ❌ | -| Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | -| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ | -| VisualBERT | ❌ | ❌ | ✅ | ❌ | ❌ | -| ViT | ❌ | ❌ | ✅ | ✅ | ✅ | -| ViTMAE | ❌ | ❌ | ✅ | ✅ | ❌ | -| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | -| Wav2Vec2-Conformer | ❌ | ❌ | ✅ | ❌ | ❌ | -| WavLM | ❌ | ❌ | ✅ | ❌ | ❌ | -| XGLM | ✅ | ✅ | ✅ | ❌ | ✅ | -| XLM | ✅ | ❌ | ✅ | ✅ | ❌ | -| XLM-ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | -| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | -| XLM-RoBERTa-XL | ❌ | ❌ | ✅ | ❌ | ❌ | -| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ | -| YOLOS | ❌ | ❌ | ✅ | ❌ | ❌ | -| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ | - - diff --git a/docs/source/de/installation.md b/docs/source/de/installation.md new file mode 100644 index 000000000000..295c9cad97bc --- /dev/null +++ b/docs/source/de/installation.md @@ -0,0 +1,250 @@ + + +# Installation + +Installieren Sie 🤗 Transformers für die Deep-Learning-Bibliothek, mit der Sie arbeiten, richten Sie Ihren Cache ein und konfigurieren Sie 🤗 Transformers optional für den Offline-Betrieb. + +🤗 Transformers wurde unter Python 3.6+, PyTorch 1.1.0+, TensorFlow 2.0+, und Flax getestet. Folgen Sie den Installationsanweisungen unten für die von Ihnen verwendete Deep-Learning-Bibliothek: + +* [PyTorch](https://pytorch.org/get-started/locally/) installation instructions. +* [TensorFlow 2.0](https://www.tensorflow.org/install/pip) installation instructions. +* [Flax](https://flax.readthedocs.io/en/latest/) installation instructions. + +## Installation mit pip + +Sie sollten 🤗 Transformers in einer [virtuellen Umgebung](https://docs.python.org/3/library/venv.html) installieren. Wenn Sie mit virtuellen Python-Umgebungen nicht vertraut sind, werfen Sie einen Blick auf diese [Anleitung](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). Eine virtuelle Umgebung macht es einfacher, verschiedene Projekte zu verwalten und Kompatibilitätsprobleme zwischen Abhängigkeiten zu vermeiden. + +Beginnen wir mit der Erstellung einer virtuellen Umgebung in Ihrem Projektverzeichnis: + + +```bash +python -m venv .env +``` + +Aktivieren wir die virtuelle Umgebung. Unter Linux und MacOs: + +```bash +source .env/bin/activate +``` +Aktivieren wir die virtuelle Umgebung unter Windows + +```bash +.env/Scripts/activate +``` + +Jetzt können wir die 🤗 Transformers mit dem folgenden Befehl installieren: + +```bash +pip install transformers +``` + +Bei reiner CPU-Unterstützung können wir 🤗 Transformers und eine Deep-Learning-Bibliothek bequem in einer Zeile installieren. Installieren wir zum Beispiel 🤗 Transformers und PyTorch mit: + +```bash +pip install transformers[torch] +``` + +🤗 Transformers und TensorFlow 2.0: + +```bash +pip install transformers[tf-cpu] +``` + +🤗 Transformers und Flax: + +```bash +pip install transformers[flax] +``` + +Überprüfen wir abschließend, ob 🤗 Transformers ordnungsgemäß installiert wurde, indem wir den folgenden Befehl ausführen. 
Es wird ein vortrainiertes Modell heruntergeladen: + +```bash +python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))" +``` + +Dann wird die Kategorie und die Wahrscheinlichkeit ausgegeben: + +```bash +[{'label': 'POSITIVE', 'score': 0.9998704791069031}] +``` + +## Installation aus dem Code + +Installieren wir 🤗 Transformers aus dem Quellcode mit dem folgenden Befehl: + +```bash +pip install git+https://github.com/huggingface/transformers +``` + +Dieser Befehl installiert die aktuelle `main` Version und nicht die neueste `stable` Version. Die `main`-Version ist nützlich, um mit den neuesten Entwicklungen Schritt zu halten. Zum Beispiel, wenn ein Fehler seit der letzten offiziellen Version behoben wurde, aber eine neue Version noch nicht veröffentlicht wurde. Das bedeutet jedoch, dass die "Hauptversion" nicht immer stabil ist. Wir bemühen uns, die Hauptversion einsatzbereit zu halten, und die meisten Probleme werden normalerweise innerhalb weniger Stunden oder eines Tages behoben. Wenn Sie auf ein Problem stoßen, öffnen Sie bitte ein [Issue] (https://github.com/huggingface/transformers/issues), damit wir es noch schneller beheben können! + +Überprüfen wir, ob 🤗 Transformers richtig installiert wurde, indem Sie den folgenden Befehl ausführen: + + +```bash +python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I love you'))" +``` + +## Editierbare Installation + +Sie benötigen eine bearbeitbare Installation, wenn Sie: + +* die "Haupt"-Version des Quellcodes verwenden möchten. +* Zu 🤗 Transformers beitragen und Änderungen am Code testen wollen. + +Klonen Sie das Repository und installieren 🤗 Transformers mit den folgenden Befehlen: + +```bash +git clone https://github.com/huggingface/transformers.git +cd transformers +pip install -e . +``` + +Diese Befehle verknüpfen den Ordner, in den Sie das Repository geklont haben, mit den Pfaden Ihrer Python-Bibliotheken. Python wird nun in dem Ordner suchen, in den Sie geklont haben, zusätzlich zu den normalen Bibliothekspfaden. Wenn zum Beispiel Ihre Python-Pakete normalerweise in `~/anaconda3/envs/main/lib/python3.7/site-packages/` installiert sind, wird Python auch den Ordner durchsuchen, in den Sie geklont haben: `~/transformers/`. + + + + +Sie müssen den Ordner `transformers` behalten, wenn Sie die Bibliothek weiter verwenden wollen. + + + +Jetzt können Sie Ihren Klon mit dem folgenden Befehl ganz einfach auf die neueste Version von 🤗 Transformers aktualisieren: + + +```bash +cd ~/transformers/ +git pull +``` + +Ihre Python-Umgebung wird beim nächsten Ausführen die `main`-Version von 🤗 Transformers finden. + +## Installation mit conda + +Installation von dem conda Kanal `huggingface`: + +```bash +conda install -c huggingface transformers +``` + +## Cache Einrichtung + +Vorgefertigte Modelle werden heruntergeladen und lokal zwischengespeichert unter: `~/.cache/huggingface/hub`. Dies ist das Standardverzeichnis, das durch die Shell-Umgebungsvariable "TRANSFORMERS_CACHE" vorgegeben ist. Unter Windows wird das Standardverzeichnis durch `C:\Benutzer\Benutzername\.cache\huggingface\hub` angegeben. Sie können die unten aufgeführten Shell-Umgebungsvariablen - in der Reihenfolge ihrer Priorität - ändern, um ein anderes Cache-Verzeichnis anzugeben: + +1. Shell-Umgebungsvariable (Standard): `HUGGINGFACE_HUB_CACHE` oder `TRANSFORMERS_CACHE`. +2. Shell-Umgebungsvariable: `HF_HOME`. +3. Shell-Umgebungsvariable: `XDG_CACHE_HOME` + `/huggingface`. 
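Zur Veranschaulichung eine kleine Skizze, wie sich das Cache-Verzeichnis aus Python heraus umlenken lässt. Der Pfad ist hier nur ein angenommenes Beispiel, und die Variable muss gesetzt werden, bevor 🤗 Transformers importiert wird:

```py
>>> import os

>>> # Angenommener Beispielpfad -- bitte an das eigene System anpassen
>>> os.environ["TRANSFORMERS_CACHE"] = "/pfad/zu/meinem/cache"

>>> from transformers import AutoModel  # nachfolgende Downloads landen nun im angegebenen Verzeichnis
```

Alternativ können Sie dieselbe Variable auch direkt in Ihrer Shell exportieren.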
+ + + + +Transformers verwendet die Shell-Umgebungsvariablen `PYTORCH_TRANSFORMERS_CACHE` oder `PYTORCH_PRETRAINED_BERT_CACHE`, wenn Sie von einer früheren Iteration dieser Bibliothek kommen und diese Umgebungsvariablen gesetzt haben, sofern Sie nicht die Shell-Umgebungsvariable `TRANSFORMERS_CACHE` angeben. + + + +## Offline Modus + +Transformers ist in der Lage, in einer Firewall- oder Offline-Umgebung zu laufen, indem es nur lokale Dateien verwendet. Setzen Sie die Umgebungsvariable `TRANSFORMERS_OFFLINE=1`, um dieses Verhalten zu aktivieren. + + + +Fügen sie [🤗 Datasets](https://huggingface.co/docs/datasets/) zu Ihrem Offline-Trainingsworkflow hinzufügen, indem Sie die Umgebungsvariable `HF_DATASETS_OFFLINE=1` setzen. + + + +So würden Sie beispielsweise ein Programm in einem normalen Netzwerk mit einer Firewall für externe Instanzen mit dem folgenden Befehl ausführen: + +```bash +python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... +``` + +Führen Sie das gleiche Programm in einer Offline-Instanz mit aus: + +```bash +HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \ +python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... +``` + +Das Skript sollte nun laufen, ohne sich aufzuhängen oder eine Zeitüberschreitung abzuwarten, da es weiß, dass es nur nach lokalen Dateien suchen soll. + + +### Abrufen von Modellen und Tokenizern zur Offline-Verwendung + +Eine andere Möglichkeit, 🤗 Transformers offline zu verwenden, besteht darin, die Dateien im Voraus herunterzuladen und dann auf ihren lokalen Pfad zu verweisen, wenn Sie sie offline verwenden müssen. Es gibt drei Möglichkeiten, dies zu tun: + +* Laden Sie eine Datei über die Benutzeroberfläche des [Model Hub](https://huggingface.co/models) herunter, indem Sie auf das ↓-Symbol klicken. + + ![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/download-icon.png) + +* Verwenden Sie den [PreTrainedModel.from_pretrained] und [PreTrainedModel.save_pretrained] Workflow: + + 1. Laden Sie Ihre Dateien im Voraus mit [`PreTrainedModel.from_pretrained`] herunter: + + ```py + >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM + + >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/T0_3B") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0_3B") + ``` + + 2. Speichern Sie Ihre Dateien in einem bestimmten Verzeichnis mit [`PreTrainedModel.save_pretrained`]: + + ```py + >>> tokenizer.save_pretrained("./your/path/bigscience_t0") + >>> model.save_pretrained("./your/path/bigscience_t0") + ``` + + 3. Wenn Sie nun offline sind, laden Sie Ihre Dateien mit [`PreTrainedModel.from_pretrained`] aus dem bestimmten Verzeichnis: + + ```py + >>> tokenizer = AutoTokenizer.from_pretrained("./your/path/bigscience_t0") + >>> model = AutoModel.from_pretrained("./your/path/bigscience_t0") + ``` + +* Programmatisches Herunterladen von Dateien mit der [huggingface_hub](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub) Bibliothek: + + 1. Installieren Sie die "huggingface_hub"-Bibliothek in Ihrer virtuellen Umgebung: + + ```bash + python -m pip install huggingface_hub + ``` + + 2. Verwenden Sie die Funktion [`hf_hub_download`](https://huggingface.co/docs/hub/adding-a-library#download-files-from-the-hub), um eine Datei in einen bestimmten Pfad herunterzuladen. 
Der folgende Befehl lädt zum Beispiel die Datei "config.json" aus dem Modell [T0](https://huggingface.co/bigscience/T0_3B) in den gewünschten Pfad herunter: + + ```py + >>> from huggingface_hub import hf_hub_download + + >>> hf_hub_download(repo_id="bigscience/T0_3B", filename="config.json", cache_dir="./your/path/bigscience_t0") + ``` + +Sobald Ihre Datei heruntergeladen und lokal zwischengespeichert ist, geben Sie den lokalen Pfad an, um sie zu laden und zu verwenden: + +```py +>>> from transformers import AutoConfig + +>>> config = AutoConfig.from_pretrained("./your/path/bigscience_t0/config.json") +``` + + + +Weitere Informationen zum Herunterladen von Dateien, die auf dem Hub gespeichert sind, finden Sie im Abschnitt [Wie man Dateien vom Hub herunterlädt] (https://huggingface.co/docs/hub/how-to-downstream). + + diff --git a/docs/source/de/installation.mdx b/docs/source/de/installation.mdx deleted file mode 100644 index 3103830ee7fd..000000000000 --- a/docs/source/de/installation.mdx +++ /dev/null @@ -1,246 +0,0 @@ - - -# Installation - -Installieren Sie 🤗 Transformers für die Deep-Learning-Bibliothek, mit der Sie arbeiten, richten Sie Ihren Cache ein und konfigurieren Sie 🤗 Transformers optional für den Offline-Betrieb. - -🤗 Transformers wurde unter Python 3.6+, PyTorch 1.1.0+, TensorFlow 2.0+, und Flax getestet. Folgen Sie den Installationsanweisungen unten für die von Ihnen verwendete Deep-Learning-Bibliothek: - -* [PyTorch](https://pytorch.org/get-started/locally/) installation instructions. -* [TensorFlow 2.0](https://www.tensorflow.org/install/pip) installation instructions. -* [Flax](https://flax.readthedocs.io/en/latest/) installation instructions. - -## Installation mit pip - -Sie sollten 🤗 Transformers in einer [virtuellen Umgebung](https://docs.python.org/3/library/venv.html) installieren. Wenn Sie mit virtuellen Python-Umgebungen nicht vertraut sind, werfen Sie einen Blick auf diese [Anleitung](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). Eine virtuelle Umgebung macht es einfacher, verschiedene Projekte zu verwalten und Kompatibilitätsprobleme zwischen Abhängigkeiten zu vermeiden. - -Beginnen wir mit der Erstellung einer virtuellen Umgebung in Ihrem Projektverzeichnis: - - -```bash -python -m venv .env -``` - -Aktivieren wir die virtuelle Umgebung. Unter Linux und MacOs: - -```bash -source .env/bin/activate -``` -Aktivieren wir die virtuelle Umgebung unter Windows - -```bash -.env/Scripts/activate -``` - -Jetzt können wir die 🤗 Transformers mit dem folgenden Befehl installieren: - -```bash -pip install transformers -``` - -Bei reiner CPU-Unterstützung können wir 🤗 Transformers und eine Deep-Learning-Bibliothek bequem in einer Zeile installieren. Installieren wir zum Beispiel 🤗 Transformers und PyTorch mit: - -```bash -pip install transformers[torch] -``` - -🤗 Transformers und TensorFlow 2.0: - -```bash -pip install transformers[tf-cpu] -``` - -🤗 Transformers und Flax: - -```bash -pip install transformers[flax] -``` - -Überprüfen wir abschließend, ob 🤗 Transformers ordnungsgemäß installiert wurde, indem wir den folgenden Befehl ausführen. 
Es wird ein vortrainiertes Modell heruntergeladen: - -```bash -python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))" -``` - -Dann wird die Kategorie und die Wahrscheinlichkeit ausgegeben: - -```bash -[{'label': 'POSITIVE', 'score': 0.9998704791069031}] -``` - -## Installation aus dem Code - -Installieren wir 🤗 Transformers aus dem Quellcode mit dem folgenden Befehl: - -```bash -pip install git+https://github.com/huggingface/transformers -``` - -Dieser Befehl installiert die aktuelle `main` Version und nicht die neueste `stable` Version. Die `main`-Version ist nützlich, um mit den neuesten Entwicklungen Schritt zu halten. Zum Beispiel, wenn ein Fehler seit der letzten offiziellen Version behoben wurde, aber eine neue Version noch nicht veröffentlicht wurde. Das bedeutet jedoch, dass die "Hauptversion" nicht immer stabil ist. Wir bemühen uns, die Hauptversion einsatzbereit zu halten, und die meisten Probleme werden normalerweise innerhalb weniger Stunden oder eines Tages behoben. Wenn Sie auf ein Problem stoßen, öffnen Sie bitte ein [Issue] (https://github.com/huggingface/transformers/issues), damit wir es noch schneller beheben können! - -Überprüfen wir, ob 🤗 Transformers richtig installiert wurde, indem Sie den folgenden Befehl ausführen: - - -```bash -python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I love you'))" -``` - -## Editierbare Installation - -Sie benötigen eine bearbeitbare Installation, wenn Sie: - -* die "Haupt"-Version des Quellcodes verwenden möchten. -* Zu 🤗 Transformers beitragen und Änderungen am Code testen wollen. - -Klonen Sie das Repository und installieren 🤗 Transformers mit den folgenden Befehlen: - -```bash -git clone https://github.com/huggingface/transformers.git -cd transformers -pip install -e . -``` - -Diese Befehle verknüpfen den Ordner, in den Sie das Repository geklont haben, mit den Pfaden Ihrer Python-Bibliotheken. Python wird nun in dem Ordner suchen, in den Sie geklont haben, zusätzlich zu den normalen Bibliothekspfaden. Wenn zum Beispiel Ihre Python-Pakete normalerweise in `~/anaconda3/envs/main/lib/python3.7/site-packages/` installiert sind, wird Python auch den Ordner durchsuchen, in den Sie geklont haben: `~/transformers/`. - - - - -Sie müssen den Ordner `transformers` behalten, wenn Sie die Bibliothek weiter verwenden wollen. - - - -Jetzt können Sie Ihren Klon mit dem folgenden Befehl ganz einfach auf die neueste Version von 🤗 Transformers aktualisieren: - - -```bash -cd ~/transformers/ -git pull -``` - -Ihre Python-Umgebung wird beim nächsten Ausführen die `main`-Version von 🤗 Transformers finden. - -## Installation mit conda - -Installation von dem conda Kanal `huggingface`: - -```bash -conda install -c huggingface transformers -``` - -## Cache Einrichtung - -Vorgefertigte Modelle werden heruntergeladen und lokal zwischengespeichert unter: `~/.cache/huggingface/hub`. Dies ist das Standardverzeichnis, das durch die Shell-Umgebungsvariable "TRANSFORMERS_CACHE" vorgegeben ist. Unter Windows wird das Standardverzeichnis durch `C:\Benutzer\Benutzername\.cache\huggingface\hub` angegeben. Sie können die unten aufgeführten Shell-Umgebungsvariablen - in der Reihenfolge ihrer Priorität - ändern, um ein anderes Cache-Verzeichnis anzugeben: - -1. Shell-Umgebungsvariable (Standard): `HUGGINGFACE_HUB_CACHE` oder `TRANSFORMERS_CACHE`. -2. Shell-Umgebungsvariable: `HF_HOME`. -3. Shell-Umgebungsvariable: `XDG_CACHE_HOME` + `/huggingface`. 
- - - - -Transformers verwendet die Shell-Umgebungsvariablen `PYTORCH_TRANSFORMERS_CACHE` oder `PYTORCH_PRETRAINED_BERT_CACHE`, wenn Sie von einer früheren Iteration dieser Bibliothek kommen und diese Umgebungsvariablen gesetzt haben, sofern Sie nicht die Shell-Umgebungsvariable `TRANSFORMERS_CACHE` angeben. - - - -## Offline Modus - -Transformers ist in der Lage, in einer Firewall- oder Offline-Umgebung zu laufen, indem es nur lokale Dateien verwendet. Setzen Sie die Umgebungsvariable `TRANSFORMERS_OFFLINE=1`, um dieses Verhalten zu aktivieren. - - - -Fügen sie [🤗 Datasets](https://huggingface.co/docs/datasets/) zu Ihrem Offline-Trainingsworkflow hinzufügen, indem Sie die Umgebungsvariable `HF_DATASETS_OFFLINE=1` setzen. - - - -So würden Sie beispielsweise ein Programm in einem normalen Netzwerk mit einer Firewall für externe Instanzen mit dem folgenden Befehl ausführen: - -```bash -python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... -``` - -Führen Sie das gleiche Programm in einer Offline-Instanz mit aus: - -```bash -HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \ -python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... -``` - -Das Skript sollte nun laufen, ohne sich aufzuhängen oder eine Zeitüberschreitung abzuwarten, da es weiß, dass es nur nach lokalen Dateien suchen soll. - - -### Abrufen von Modellen und Tokenizern zur Offline-Verwendung - -Eine andere Möglichkeit, 🤗 Transformers offline zu verwenden, besteht darin, die Dateien im Voraus herunterzuladen und dann auf ihren lokalen Pfad zu verweisen, wenn Sie sie offline verwenden müssen. Es gibt drei Möglichkeiten, dies zu tun: - -* Laden Sie eine Datei über die Benutzeroberfläche des [Model Hub](https://huggingface.co/models) herunter, indem Sie auf das ↓-Symbol klicken. - - ![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/download-icon.png) - -* Verwenden Sie den [PreTrainedModel.from_pretrained] und [PreTrainedModel.save_pretrained] Workflow: - - 1. Laden Sie Ihre Dateien im Voraus mit [`PreTrainedModel.from_pretrained`] herunter: - - ```py - >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - - >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/T0_3B") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0_3B") - ``` - - 2. Speichern Sie Ihre Dateien in einem bestimmten Verzeichnis mit [`PreTrainedModel.save_pretrained`]: - - ```py - >>> tokenizer.save_pretrained("./your/path/bigscience_t0") - >>> model.save_pretrained("./your/path/bigscience_t0") - ``` - - 3. Wenn Sie nun offline sind, laden Sie Ihre Dateien mit [`PreTrainedModel.from_pretrained`] aus dem bestimmten Verzeichnis: - - ```py - >>> tokenizer = AutoTokenizer.from_pretrained("./your/path/bigscience_t0") - >>> model = AutoModel.from_pretrained("./your/path/bigscience_t0") - ``` - -* Programmatisches Herunterladen von Dateien mit der [huggingface_hub](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub) Bibliothek: - - 1. Installieren Sie die "huggingface_hub"-Bibliothek in Ihrer virtuellen Umgebung: - - ```bash - python -m pip install huggingface_hub - ``` - - 2. Verwenden Sie die Funktion [`hf_hub_download`](https://huggingface.co/docs/hub/adding-a-library#download-files-from-the-hub), um eine Datei in einen bestimmten Pfad herunterzuladen. 
Der folgende Befehl lädt zum Beispiel die Datei "config.json" aus dem Modell [T0](https://huggingface.co/bigscience/T0_3B) in den gewünschten Pfad herunter: - - ```py - >>> from huggingface_hub import hf_hub_download - - >>> hf_hub_download(repo_id="bigscience/T0_3B", filename="config.json", cache_dir="./your/path/bigscience_t0") - ``` - -Sobald Ihre Datei heruntergeladen und lokal zwischengespeichert ist, geben Sie den lokalen Pfad an, um sie zu laden und zu verwenden: - -```py ->>> from transformers import AutoConfig - ->>> config = AutoConfig.from_pretrained("./your/path/bigscience_t0/config.json") -``` - - - -Weitere Informationen zum Herunterladen von Dateien, die auf dem Hub gespeichert sind, finden Sie im Abschnitt [Wie man Dateien vom Hub herunterlädt] (https://huggingface.co/docs/hub/how-to-downstream). - - diff --git a/docs/source/de/llm_tutorial.md b/docs/source/de/llm_tutorial.md new file mode 100644 index 000000000000..1c5da4103283 --- /dev/null +++ b/docs/source/de/llm_tutorial.md @@ -0,0 +1,221 @@ + + + +# Generation with LLMs + +[[open-in-colab]] + +LLMs (Large Language Models) sind die Schlüsselkomponente bei der Texterstellung. Kurz gesagt, bestehen sie aus großen, vortrainierten Transformationsmodellen, die darauf trainiert sind, das nächste Wort (oder genauer gesagt Token) aus einem Eingabetext vorherzusagen. Da sie jeweils ein Token vorhersagen, müssen Sie etwas Aufwändigeres tun, um neue Sätze zu generieren, als nur das Modell aufzurufen - Sie müssen eine autoregressive Generierung durchführen. + +Die autoregressive Generierung ist ein Verfahren zur Inferenzzeit, bei dem ein Modell mit seinen eigenen generierten Ausgaben iterativ aufgerufen wird, wenn einige anfängliche Eingaben vorliegen. In 🤗 Transformers wird dies von der Methode [`~generation.GenerationMixin.generate`] übernommen, die allen Modellen mit generativen Fähigkeiten zur Verfügung steht. + +Dieses Tutorial zeigt Ihnen, wie Sie: + +* Text mit einem LLM generieren +* Vermeiden Sie häufige Fallstricke +* Nächste Schritte, damit Sie das Beste aus Ihrem LLM herausholen können + +Bevor Sie beginnen, stellen Sie sicher, dass Sie alle erforderlichen Bibliotheken installiert haben: + +```bash +pip install transformers bitsandbytes>=0.39.0 -q +``` + + +## Text generieren + +Ein Sprachmodell, das für [causal language modeling](tasks/language_modeling) trainiert wurde, nimmt eine Folge von Text-Token als Eingabe und gibt die Wahrscheinlichkeitsverteilung für das nächste Token zurück. + + +
+<!-- Abbildung: "Forward pass of an LLM" -->
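Der folgende Ausschnitt skizziert einen solchen einzelnen Vorwärtsdurchlauf. Als Annahme dient hier das kleine Modell `gpt2`; jedes andere kausale Sprachmodell funktioniert analog:

```py
>>> import torch
>>> from transformers import AutoModelForCausalLM, AutoTokenizer

>>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
>>> model = AutoModelForCausalLM.from_pretrained("gpt2")

>>> inputs = tokenizer("A list of colors: red, blue", return_tensors="pt")
>>> with torch.no_grad():
...     logits = model(**inputs).logits

>>> # Die Logits der letzten Position ergeben die Verteilung über das nächste Token
>>> next_token_probs = torch.softmax(logits[:, -1, :], dim=-1)
>>> next_token_probs.shape  # (Batchgröße, Vokabulargröße)
torch.Size([1, 50257])
```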
+ +Ein wichtiger Aspekt der autoregressiven Generierung mit LLMs ist die Auswahl des nächsten Tokens aus dieser Wahrscheinlichkeitsverteilung. In diesem Schritt ist alles möglich, solange Sie am Ende ein Token für die nächste Iteration haben. Das heißt, es kann so einfach sein wie die Auswahl des wahrscheinlichsten Tokens aus der Wahrscheinlichkeitsverteilung oder so komplex wie die Anwendung von einem Dutzend Transformationen vor der Stichprobenziehung aus der resultierenden Verteilung. + + +
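Zur Veranschaulichung eine stark vereinfachte, gierige Fassung dieser iterativen Auswahl, aufbauend auf dem Ausschnitt oben (nur eine Skizze ohne Abbruchbedingung; in der Praxis übernimmt das die weiter unten verwendete Methode [`~generation.GenerationMixin.generate`]):

```py
>>> input_ids = inputs["input_ids"]
>>> for _ in range(5):  # fünf neue Token erzeugen
...     with torch.no_grad():
...         logits = model(input_ids).logits
...     # Gierige Auswahl des wahrscheinlichsten Tokens; für Sampling ließe sich z. B. torch.multinomial verwenden
...     next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
...     input_ids = torch.cat([input_ids, next_token], dim=-1)

>>> print(tokenizer.decode(input_ids[0], skip_special_tokens=True))
```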
+<!-- Abbildung: "Die autoregressive Generierung wählt iterativ das nächste Token aus einer Wahrscheinlichkeitsverteilung aus, um Text zu erzeugen" -->
+ +Der oben dargestellte Prozess wird iterativ wiederholt, bis eine bestimmte Abbruchbedingung erreicht ist. Im Idealfall wird die Abbruchbedingung vom Modell vorgegeben, das lernen sollte, wann es ein Ende-der-Sequenz-Token (EOS) ausgeben muss. Ist dies nicht der Fall, stoppt die Generierung, wenn eine vordefinierte Maximallänge erreicht ist. + +Damit sich Ihr Modell so verhält, wie Sie es für Ihre Aufgabe erwarten, müssen Sie den Schritt der Token-Auswahl und die Abbruchbedingung richtig einstellen. Aus diesem Grund haben wir zu jedem Modell eine [`~generation.GenerationConfig`]-Datei, die eine gute generative Standardparametrisierung enthält und zusammen mit Ihrem Modell geladen wird. + +Lassen Sie uns über Code sprechen! + + + +Wenn Sie an der grundlegenden Verwendung von LLMs interessiert sind, ist unsere High-Level-Schnittstelle [`Pipeline`](pipeline_tutorial) ein guter Ausgangspunkt. LLMs erfordern jedoch oft fortgeschrittene Funktionen wie Quantisierung und Feinsteuerung des Token-Auswahlschritts, was am besten über [`~generation.GenerationMixin.generate`] erfolgt. Die autoregressive Generierung mit LLMs ist ebenfalls ressourcenintensiv und sollte für einen angemessenen Durchsatz auf einer GPU ausgeführt werden. + + + + +Zunächst müssen Sie das Modell laden. + +```py +>>> from transformers import AutoModelForCausalLM + +>>> model = AutoModelForCausalLM.from_pretrained( +... "openlm-research/open_llama_7b", device_map="auto", load_in_4bit=True +... ) +``` + +Sie werden zwei Flags in dem Aufruf `from_pretrained` bemerken: + + - `device_map` stellt sicher, dass das Modell auf Ihre GPU(s) übertragen wird + - `load_in_4bit` wendet [dynamische 4-Bit-Quantisierung](main_classes/quantization) an, um die Ressourcenanforderungen massiv zu reduzieren + +Es gibt noch andere Möglichkeiten, ein Modell zu initialisieren, aber dies ist eine gute Grundlage, um mit einem LLM zu beginnen. + +Als nächstes müssen Sie Ihre Texteingabe mit einem [tokenizer](tokenizer_summary) vorverarbeiten. + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b") +>>> model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda") +``` + +Die Variable `model_inputs` enthält die tokenisierte Texteingabe sowie die Aufmerksamkeitsmaske. Obwohl [`~generation.GenerationMixin.generate`] sein Bestes tut, um die Aufmerksamkeitsmaske abzuleiten, wenn sie nicht übergeben wird, empfehlen wir, sie für optimale Ergebnisse wann immer möglich zu übergeben. + +Rufen Sie schließlich die Methode [~generation.GenerationMixin.generate] auf, um die generierten Token zurückzugeben, die vor dem Drucken in Text umgewandelt werden sollten. + +```py +>>> generated_ids = model.generate(**model_inputs) +>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +'A list of colors: red, blue, green, yellow, black, white, and brown' +``` + +Und das war's! Mit ein paar Zeilen Code können Sie sich die Macht eines LLM zunutze machen. + + +## Häufige Fallstricke + +Es gibt viele [Generierungsstrategien](generation_strategies), und manchmal sind die Standardwerte für Ihren Anwendungsfall vielleicht nicht geeignet. Wenn Ihre Ausgaben nicht mit dem übereinstimmen, was Sie erwarten, haben wir eine Liste der häufigsten Fallstricke erstellt und wie Sie diese vermeiden können. 
+ +```py +>>> from transformers import AutoModelForCausalLM, AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b") +>>> tokenizer.pad_token = tokenizer.eos_token # Llama has no pad token by default +>>> model = AutoModelForCausalLM.from_pretrained( +... "openlm-research/open_llama_7b", device_map="auto", load_in_4bit=True +... ) +``` + +### Generierte Ausgabe ist zu kurz/lang + +Wenn in der Datei [~generation.GenerationConfig`] nichts angegeben ist, gibt `generate` standardmäßig bis zu 20 Token zurück. Wir empfehlen dringend, `max_new_tokens` in Ihrem `generate`-Aufruf manuell zu setzen, um die maximale Anzahl neuer Token zu kontrollieren, die zurückgegeben werden können. Beachten Sie, dass LLMs (genauer gesagt, [decoder-only models](https://huggingface.co/learn/nlp-course/chapter1/6?fw=pt)) auch die Eingabeaufforderung als Teil der Ausgabe zurückgeben. + + +```py +>>> model_inputs = tokenizer(["A sequence of numbers: 1, 2"], return_tensors="pt").to("cuda") + +>>> # By default, the output will contain up to 20 tokens +>>> generated_ids = model.generate(**model_inputs) +>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +'A sequence of numbers: 1, 2, 3, 4, 5' + +>>> # Setting `max_new_tokens` allows you to control the maximum length +>>> generated_ids = model.generate(**model_inputs, max_new_tokens=50) +>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +'A sequence of numbers: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,' +``` + +### Falscher Generierungsmodus + +Standardmäßig und sofern nicht in der Datei [~generation.GenerationConfig`] angegeben, wählt `generate` bei jeder Iteration das wahrscheinlichste Token aus (gierige Dekodierung). Je nach Aufgabe kann dies unerwünscht sein; kreative Aufgaben wie Chatbots oder das Schreiben eines Aufsatzes profitieren vom Sampling. Andererseits profitieren Aufgaben, bei denen es auf die Eingabe ankommt, wie z.B. Audiotranskription oder Übersetzung, von der gierigen Dekodierung. Aktivieren Sie das Sampling mit `do_sample=True`. Mehr zu diesem Thema erfahren Sie in diesem [Blogbeitrag] (https://huggingface.co/blog/how-to-generate). + +```py +>>> # Set seed or reproducibility -- you don't need this unless you want full reproducibility +>>> from transformers import set_seed +>>> set_seed(0) + +>>> model_inputs = tokenizer(["I am a cat."], return_tensors="pt").to("cuda") + +>>> # LLM + greedy decoding = repetitive, boring output +>>> generated_ids = model.generate(**model_inputs) +>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +'I am a cat. I am a cat. I am a cat. I am a cat' + +>>> # With sampling, the output becomes more creative! +>>> generated_ids = model.generate(**model_inputs, do_sample=True) +>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +'I am a cat.\nI just need to be. I am always.\nEvery time' +``` + +### Falsche Auffüllseite + +LLMs sind [decoder-only](https://huggingface.co/learn/nlp-course/chapter1/6?fw=pt)-Architekturen, d.h. sie iterieren weiter über Ihre Eingabeaufforderung. Wenn Ihre Eingaben nicht die gleiche Länge haben, müssen sie aufgefüllt werden. Da LLMs nicht darauf trainiert sind, mit aufgefüllten Token fortzufahren, muss Ihre Eingabe links aufgefüllt werden. Vergessen Sie auch nicht, die Aufmerksamkeitsmaske an generate zu übergeben! + +```py +>>> # The tokenizer initialized above has right-padding active by default: the 1st sequence, +>>> # which is shorter, has padding on the right side. 
Generation fails. +>>> model_inputs = tokenizer( +... ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt" +... ).to("cuda") +>>> generated_ids = model.generate(**model_inputs) +>>> tokenizer.batch_decode(generated_ids[0], skip_special_tokens=True)[0] +'' + +>>> # With left-padding, it works as expected! +>>> tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b", padding_side="left") +>>> tokenizer.pad_token = tokenizer.eos_token # Llama has no pad token by default +>>> model_inputs = tokenizer( +... ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt" +... ).to("cuda") +>>> generated_ids = model.generate(**model_inputs) +>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +'1, 2, 3, 4, 5, 6,' +``` + + + +## Weitere Ressourcen + +Während der Prozess der autoregressiven Generierung relativ einfach ist, kann die optimale Nutzung Ihres LLM ein schwieriges Unterfangen sein, da es viele bewegliche Teile gibt. Für Ihre nächsten Schritte, die Ihnen helfen, tiefer in die LLM-Nutzung und das Verständnis einzutauchen: + + +### Fortgeschrittene Nutzung generieren + +1. [Leitfaden](generation_strategies) zur Steuerung verschiedener Generierungsmethoden, zur Einrichtung der Generierungskonfigurationsdatei und zum Streaming der Ausgabe; +2. API-Referenz zu [`~generation.GenerationConfig`], [`~generation.GenerationMixin.generate`] und [generate-bezogene Klassen](internal/generation_utils). + +### LLM-Ranglisten + +1. [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard), das sich auf die Qualität der Open-Source-Modelle konzentriert; +2. [Open LLM-Perf Leaderboard](https://huggingface.co/spaces/optimum/llm-perf-leaderboard), das sich auf den LLM-Durchsatz konzentriert. + +### Latenz und Durchsatz + +1. [Leitfaden](main_classes/quantization) zur dynamischen Quantisierung, der Ihnen zeigt, wie Sie Ihren Speicherbedarf drastisch reduzieren können. + +### Verwandte Bibliotheken + +1. [text-generation-inference](https://github.com/huggingface/text-generation-inference), ein produktionsreifer Server für LLMs; +2. [`optimum`](https://github.com/huggingface/optimum), eine Erweiterung von 🤗 Transformers, die für bestimmte Hardware-Geräte optimiert. diff --git a/docs/source/de/model_sharing.md b/docs/source/de/model_sharing.md new file mode 100644 index 000000000000..415277e00e5e --- /dev/null +++ b/docs/source/de/model_sharing.md @@ -0,0 +1,232 @@ + + +# Ein Modell teilen + +Die letzten beiden Tutorials haben gezeigt, wie man ein Modell mit PyTorch, Keras und 🤗 Accelerate für verteilte Setups feinabstimmen kann. Der nächste Schritt besteht darin, Ihr Modell mit der Community zu teilen! Bei Hugging Face glauben wir an den offenen Austausch von Wissen und Ressourcen, um künstliche Intelligenz für alle zu demokratisieren. Wir ermutigen Sie, Ihr Modell mit der Community zu teilen, um anderen zu helfen, Zeit und Ressourcen zu sparen. + +In diesem Tutorial lernen Sie zwei Methoden kennen, wie Sie ein trainiertes oder verfeinertes Modell auf dem [Model Hub](https://huggingface.co/models) teilen können: + +- Programmgesteuertes Übertragen Ihrer Dateien auf den Hub. +- Ziehen Sie Ihre Dateien per Drag-and-Drop über die Weboberfläche in den Hub. + + + + + +Um ein Modell mit der Öffentlichkeit zu teilen, benötigen Sie ein Konto auf [huggingface.co](https://huggingface.co/join). Sie können auch einer bestehenden Organisation beitreten oder eine neue Organisation gründen. 
+ + + +## Repository-Funktionen + +Jedes Repository im Model Hub verhält sich wie ein typisches GitHub-Repository. Unsere Repositorys bieten Versionierung, Commit-Historie und die Möglichkeit, Unterschiede zu visualisieren. + +Die integrierte Versionierung des Model Hub basiert auf Git und [git-lfs](https://git-lfs.github.com/). Mit anderen Worten: Sie können ein Modell als ein Repository behandeln, was eine bessere Zugriffskontrolle und Skalierbarkeit ermöglicht. Die Versionskontrolle ermöglicht *Revisionen*, eine Methode zum Anheften einer bestimmten Version eines Modells mit einem Commit-Hash, Tag oder Branch. + +Folglich können Sie eine bestimmte Modellversion mit dem Parameter "Revision" laden: + +```py +>>> model = AutoModel.from_pretrained( +... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash +... ) +``` + +Dateien lassen sich auch in einem Repository leicht bearbeiten, und Sie können die Commit-Historie sowie die Unterschiede einsehen: + +![vis_diff](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vis_diff.png) + +## Einrichtung + +Bevor Sie ein Modell für den Hub freigeben, benötigen Sie Ihre Hugging Face-Anmeldedaten. Wenn Sie Zugang zu einem Terminal haben, führen Sie den folgenden Befehl in der virtuellen Umgebung aus, in der 🤗 Transformers installiert ist. Dadurch werden Ihre Zugangsdaten in Ihrem Hugging Face-Cache-Ordner (standardmäßig `~/.cache/`) gespeichert: + +```bash +huggingface-cli login +``` + +Wenn Sie ein Notebook wie Jupyter oder Colaboratory verwenden, stellen Sie sicher, dass Sie die [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) Bibliothek installiert haben. Diese Bibliothek ermöglicht Ihnen die programmatische Interaktion mit dem Hub. + +```bash +pip install huggingface_hub +``` + +Verwenden Sie dann `notebook_login`, um sich beim Hub anzumelden, und folgen Sie dem Link [hier](https://huggingface.co/settings/token), um ein Token für die Anmeldung zu generieren: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Ein Modell für alle Frameworks konvertieren + +Um sicherzustellen, dass Ihr Modell von jemandem verwendet werden kann, der mit einem anderen Framework arbeitet, empfehlen wir Ihnen, Ihr Modell sowohl mit PyTorch- als auch mit TensorFlow-Checkpoints zu konvertieren und hochzuladen. Während Benutzer immer noch in der Lage sind, Ihr Modell von einem anderen Framework zu laden, wenn Sie diesen Schritt überspringen, wird es langsamer sein, weil 🤗 Transformers den Checkpoint on-the-fly konvertieren müssen. + +Die Konvertierung eines Checkpoints für ein anderes Framework ist einfach. Stellen Sie sicher, dass Sie PyTorch und TensorFlow installiert haben (siehe [hier](installation) für Installationsanweisungen), und finden Sie dann das spezifische Modell für Ihre Aufgabe in dem anderen Framework. 
+ + + +Geben Sie `from_tf=True` an, um einen Prüfpunkt von TensorFlow nach PyTorch zu konvertieren: + +```py +>>> pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True) +>>> pt_model.save_pretrained("path/to/awesome-name-you-picked") +``` + + +Geben Sie `from_pt=True` an, um einen Prüfpunkt von PyTorch nach TensorFlow zu konvertieren: + +```py +>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True) +``` + +Dann können Sie Ihr neues TensorFlow-Modell mit seinem neuen Checkpoint speichern: + +```py +>>> tf_model.save_pretrained("path/to/awesome-name-you-picked") +``` + + +Wenn ein Modell in Flax verfügbar ist, können Sie auch einen Kontrollpunkt von PyTorch nach Flax konvertieren: + +```py +>>> flax_model = FlaxDistilBertForSequenceClassification.from_pretrained( +... "path/to/awesome-name-you-picked", from_pt=True +... ) +``` + + + +## Ein Modell während des Trainings hochladen + + + + + +Die Weitergabe eines Modells an den Hub ist so einfach wie das Hinzufügen eines zusätzlichen Parameters oder Rückrufs. Erinnern Sie sich an das [Feinabstimmungs-Tutorial](training), in der Klasse [`TrainingArguments`] geben Sie Hyperparameter und zusätzliche Trainingsoptionen an. Eine dieser Trainingsoptionen beinhaltet die Möglichkeit, ein Modell direkt an den Hub zu pushen. Setzen Sie `push_to_hub=True` in Ihrer [`TrainingArguments`]: + +```py +>>> training_args = TrainingArguments(output_dir="my-awesome-model", push_to_hub=True) +``` + +Übergeben Sie Ihre Trainingsargumente wie gewohnt an [`Trainer`]: + +```py +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=small_train_dataset, +... eval_dataset=small_eval_dataset, +... compute_metrics=compute_metrics, +... ) +``` + +Nach der Feinabstimmung Ihres Modells rufen Sie [`~transformers.Trainer.push_to_hub`] auf [`Trainer`] auf, um das trainierte Modell an den Hub zu übertragen. Transformers fügt sogar automatisch Trainings-Hyperparameter, Trainingsergebnisse und Framework-Versionen zu Ihrer Modellkarte hinzu! + +```py +>>> trainer.push_to_hub() +``` + + +Geben Sie ein Modell mit [`PushToHubCallback`] an den Hub weiter. In der [`PushToHubCallback`] Funktion, fügen Sie hinzu: + +- Ein Ausgabeverzeichnis für Ihr Modell. +- Einen Tokenizer. +- Die `hub_model_id`, die Ihr Hub-Benutzername und Modellname ist. + +```py +>>> from transformers import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="./your_model_save_path", tokenizer=tokenizer, hub_model_id="your-username/my-awesome-model" +... ) +``` + +Fügen Sie den Callback zu [`fit`](https://keras.io/api/models/model_training_apis/) hinzu, und 🤗 Transformers wird das trainierte Modell an den Hub weiterleiten: + +```py +>>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback) +``` + + + +## Verwenden Sie die Funktion `push_to_hub`. + +Sie können `push_to_hub` auch direkt für Ihr Modell aufrufen, um es in den Hub hochzuladen. + +Geben Sie den Namen Ihres Modells in "push_to_hub" an: + +```py +>>> pt_model.push_to_hub("my-awesome-model") +``` + +Dadurch wird ein Repository unter Ihrem Benutzernamen mit dem Modellnamen `my-awesome-model` erstellt. 
Benutzer können nun Ihr Modell mit der Funktion `from_pretrained` laden: + +```py +>>> from transformers import AutoModel + +>>> model = AutoModel.from_pretrained("your_username/my-awesome-model") +``` + +Wenn Sie zu einer Organisation gehören und Ihr Modell stattdessen unter dem Namen der Organisation pushen wollen, fügen Sie diesen einfach zur `repo_id` hinzu: + +```py +>>> pt_model.push_to_hub("my-awesome-org/my-awesome-model") +``` + +Die Funktion "push_to_hub" kann auch verwendet werden, um andere Dateien zu einem Modell-Repository hinzuzufügen. Zum Beispiel kann man einen Tokenizer zu einem Modell-Repository hinzufügen: + +```py +>>> tokenizer.push_to_hub("my-awesome-model") +``` + +Oder vielleicht möchten Sie die TensorFlow-Version Ihres fein abgestimmten PyTorch-Modells hinzufügen: + +```py +>>> tf_model.push_to_hub("my-awesome-model") +``` + +Wenn Sie nun zu Ihrem Hugging Face-Profil navigieren, sollten Sie Ihr neu erstelltes Modell-Repository sehen. Wenn Sie auf die Registerkarte **Dateien** klicken, werden alle Dateien angezeigt, die Sie in das Repository hochgeladen haben. + +Weitere Einzelheiten zum Erstellen und Hochladen von Dateien in ein Repository finden Sie in der Hub-Dokumentation [hier](https://huggingface.co/docs/hub/how-to-upstream). + +## Hochladen mit der Weboberfläche + +Benutzer, die einen no-code Ansatz bevorzugen, können ein Modell über das Webinterface des Hubs hochladen. Besuchen Sie [huggingface.co/new](https://huggingface.co/new) um ein neues Repository zu erstellen: + +![new_model_repo](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/new_model_repo.png) + +Fügen Sie von hier aus einige Informationen über Ihr Modell hinzu: + +- Wählen Sie den **Besitzer** des Repositorys. Dies können Sie selbst oder eine der Organisationen sein, denen Sie angehören. +- Wählen Sie einen Namen für Ihr Modell, der auch der Name des Repositorys sein wird. +- Wählen Sie, ob Ihr Modell öffentlich oder privat ist. +- Geben Sie die Lizenzverwendung für Ihr Modell an. + +Klicken Sie nun auf die Registerkarte **Dateien** und klicken Sie auf die Schaltfläche **Datei hinzufügen**, um eine neue Datei in Ihr Repository hochzuladen. Ziehen Sie dann eine Datei per Drag-and-Drop hoch und fügen Sie eine Übergabemeldung hinzu. + +![upload_file](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/upload_file.png) + +## Hinzufügen einer Modellkarte + +Um sicherzustellen, dass die Benutzer die Fähigkeiten, Grenzen, möglichen Verzerrungen und ethischen Aspekte Ihres Modells verstehen, fügen Sie bitte eine Modellkarte zu Ihrem Repository hinzu. Die Modellkarte wird in der Datei `README.md` definiert. Sie können eine Modellkarte hinzufügen, indem Sie: + +* Manuelles Erstellen und Hochladen einer "README.md"-Datei. +* Klicken Sie auf die Schaltfläche **Modellkarte bearbeiten** in Ihrem Modell-Repository. + +Werfen Sie einen Blick auf die DistilBert [model card](https://huggingface.co/distilbert-base-uncased) als gutes Beispiel für die Art von Informationen, die eine Modellkarte enthalten sollte. Weitere Details über andere Optionen, die Sie in der Datei "README.md" einstellen können, wie z.B. den Kohlenstoff-Fußabdruck eines Modells oder Beispiele für Widgets, finden Sie in der Dokumentation [hier](https://huggingface.co/docs/hub/models-cards). 
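Wer die Modellkarte lieber programmatisch anlegen möchte, kann dafür zum Beispiel die Klasse `ModelCard` aus der `huggingface_hub`-Bibliothek verwenden. Eine minimale Skizze, bei der Repository-Name und Inhalt frei gewählte Annahmen sind:

```py
>>> from huggingface_hub import ModelCard

>>> content = """---
... license: apache-2.0
... ---
... # my-awesome-model
...
... Kurzbeschreibung, Trainingsdaten, Einschränkungen und mögliche Verzerrungen des Modells.
... """
>>> card = ModelCard(content)
>>> card.push_to_hub("your-username/my-awesome-model")  # legt die README.md im Repository an bzw. aktualisiert sie
```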
\ No newline at end of file diff --git a/docs/source/de/model_sharing.mdx b/docs/source/de/model_sharing.mdx deleted file mode 100644 index 50318595ffc2..000000000000 --- a/docs/source/de/model_sharing.mdx +++ /dev/null @@ -1,228 +0,0 @@ - - -# Ein Modell teilen - -Die letzten beiden Tutorials haben gezeigt, wie man ein Modell mit PyTorch, Keras und 🤗 Accelerate für verteilte Setups feinabstimmen kann. Der nächste Schritt besteht darin, Ihr Modell mit der Community zu teilen! Bei Hugging Face glauben wir an den offenen Austausch von Wissen und Ressourcen, um künstliche Intelligenz für alle zu demokratisieren. Wir ermutigen Sie, Ihr Modell mit der Community zu teilen, um anderen zu helfen, Zeit und Ressourcen zu sparen. - -In diesem Tutorial lernen Sie zwei Methoden kennen, wie Sie ein trainiertes oder verfeinertes Modell auf dem [Model Hub](https://huggingface.co/models) teilen können: - -- Programmgesteuertes Übertragen Ihrer Dateien auf den Hub. -- Ziehen Sie Ihre Dateien per Drag-and-Drop über die Weboberfläche in den Hub. - - - - - -Um ein Modell mit der Öffentlichkeit zu teilen, benötigen Sie ein Konto auf [huggingface.co](https://huggingface.co/join). Sie können auch einer bestehenden Organisation beitreten oder eine neue Organisation gründen. - - - -## Repository-Funktionen - -Jedes Repository im Model Hub verhält sich wie ein typisches GitHub-Repository. Unsere Repositorys bieten Versionierung, Commit-Historie und die Möglichkeit, Unterschiede zu visualisieren. - -Die integrierte Versionierung des Model Hub basiert auf Git und [git-lfs](https://git-lfs.github.com/). Mit anderen Worten: Sie können ein Modell als ein Repository behandeln, was eine bessere Zugriffskontrolle und Skalierbarkeit ermöglicht. Die Versionskontrolle ermöglicht *Revisionen*, eine Methode zum Anheften einer bestimmten Version eines Modells mit einem Commit-Hash, Tag oder Branch. - -Folglich können Sie eine bestimmte Modellversion mit dem Parameter "Revision" laden: - -```py ->>> model = AutoModel.from_pretrained( -... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash -... ) -``` - -Dateien lassen sich auch in einem Repository leicht bearbeiten, und Sie können die Commit-Historie sowie die Unterschiede einsehen: - -![vis_diff](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vis_diff.png) - -## Einrichtung - -Bevor Sie ein Modell für den Hub freigeben, benötigen Sie Ihre Hugging Face-Anmeldedaten. Wenn Sie Zugang zu einem Terminal haben, führen Sie den folgenden Befehl in der virtuellen Umgebung aus, in der 🤗 Transformers installiert ist. Dadurch werden Ihre Zugangsdaten in Ihrem Hugging Face-Cache-Ordner (standardmäßig `~/.cache/`) gespeichert: - -```bash -huggingface-cli login -``` - -Wenn Sie ein Notebook wie Jupyter oder Colaboratory verwenden, stellen Sie sicher, dass Sie die [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) Bibliothek installiert haben. Diese Bibliothek ermöglicht Ihnen die programmatische Interaktion mit dem Hub. 
- -```bash -pip install huggingface_hub -``` - -Verwenden Sie dann `notebook_login`, um sich beim Hub anzumelden, und folgen Sie dem Link [hier](https://huggingface.co/settings/token), um ein Token für die Anmeldung zu generieren: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Ein Modell für alle Frameworks konvertieren - -Um sicherzustellen, dass Ihr Modell von jemandem verwendet werden kann, der mit einem anderen Framework arbeitet, empfehlen wir Ihnen, Ihr Modell sowohl mit PyTorch- als auch mit TensorFlow-Checkpoints zu konvertieren und hochzuladen. Während Benutzer immer noch in der Lage sind, Ihr Modell von einem anderen Framework zu laden, wenn Sie diesen Schritt überspringen, wird es langsamer sein, weil 🤗 Transformers den Checkpoint on-the-fly konvertieren müssen. - -Die Konvertierung eines Checkpoints für ein anderes Framework ist einfach. Stellen Sie sicher, dass Sie PyTorch und TensorFlow installiert haben (siehe [hier](installation) für Installationsanweisungen), und finden Sie dann das spezifische Modell für Ihre Aufgabe in dem anderen Framework. - - - -Geben Sie `from_tf=True` an, um einen Prüfpunkt von TensorFlow nach PyTorch zu konvertieren: - -```py ->>> pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True) ->>> pt_model.save_pretrained("path/to/awesome-name-you-picked") -``` - - -Geben Sie `from_pt=True` an, um einen Prüfpunkt von PyTorch nach TensorFlow zu konvertieren: - -```py ->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True) -``` - -Dann können Sie Ihr neues TensorFlow-Modell mit seinem neuen Checkpoint speichern: - -```py ->>> tf_model.save_pretrained("path/to/awesome-name-you-picked") -``` - - -Wenn ein Modell in Flax verfügbar ist, können Sie auch einen Kontrollpunkt von PyTorch nach Flax konvertieren: - -```py ->>> flax_model = FlaxDistilBertForSequenceClassification.from_pretrained( -... "path/to/awesome-name-you-picked", from_pt=True -... ) -``` - - - -## Ein Modell während des Trainings hochladen - - - - - -Die Weitergabe eines Modells an den Hub ist so einfach wie das Hinzufügen eines zusätzlichen Parameters oder Rückrufs. Erinnern Sie sich an das [Feinabstimmungs-Tutorial](training), in der Klasse [`TrainingArguments`] geben Sie Hyperparameter und zusätzliche Trainingsoptionen an. Eine dieser Trainingsoptionen beinhaltet die Möglichkeit, ein Modell direkt an den Hub zu pushen. Setzen Sie `push_to_hub=True` in Ihrer [`TrainingArguments`]: - -```py ->>> training_args = TrainingArguments(output_dir="my-awesome-model", push_to_hub=True) -``` - -Übergeben Sie Ihre Trainingsargumente wie gewohnt an [`Trainer`]: - -```py ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=small_train_dataset, -... eval_dataset=small_eval_dataset, -... compute_metrics=compute_metrics, -... ) -``` - -Nach der Feinabstimmung Ihres Modells rufen Sie [`~transformers.Trainer.push_to_hub`] auf [`Trainer`] auf, um das trainierte Modell an den Hub zu übertragen. Transformers fügt sogar automatisch Trainings-Hyperparameter, Trainingsergebnisse und Framework-Versionen zu Ihrer Modellkarte hinzu! - -```py ->>> trainer.push_to_hub() -``` - - -Geben Sie ein Modell mit [`PushToHubCallback`] an den Hub weiter. In der [`PushToHubCallback`] Funktion, fügen Sie hinzu: - -- Ein Ausgabeverzeichnis für Ihr Modell. -- Einen Tokenizer. -- Die `hub_model_id`, die Ihr Hub-Benutzername und Modellname ist. 
- -```py ->>> from transformers.keras.callbacks import PushToHubCallback - ->>> push_to_hub_callback = PushToHubCallback( -... output_dir="./your_model_save_path", tokenizer=tokenizer, hub_model_id="your-username/my-awesome-model" -... ) -``` - -Fügen Sie den Callback zu [`fit`](https://keras.io/api/models/model_training_apis/) hinzu, und 🤗 Transformers wird das trainierte Modell an den Hub weiterleiten: - -```py ->>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback) -``` - - - -## Verwenden Sie die Funktion `push_to_hub`. - -Sie können `push_to_hub` auch direkt für Ihr Modell aufrufen, um es in den Hub hochzuladen. - -Geben Sie den Namen Ihres Modells in "push_to_hub" an: - -```py ->>> pt_model.push_to_hub("my-awesome-model") -``` - -Dadurch wird ein Repository unter Ihrem Benutzernamen mit dem Modellnamen `my-awesome-model` erstellt. Benutzer können nun Ihr Modell mit der Funktion `from_pretrained` laden: - -```py ->>> from transformers import AutoModel - ->>> model = AutoModel.from_pretrained("your_username/my-awesome-model") -``` - -Wenn Sie zu einer Organisation gehören und Ihr Modell stattdessen unter dem Namen der Organisation pushen wollen, fügen Sie diesen einfach zur `repo_id` hinzu: - -```py ->>> pt_model.push_to_hub("my-awesome-org/my-awesome-model") -``` - -Die Funktion "push_to_hub" kann auch verwendet werden, um andere Dateien zu einem Modell-Repository hinzuzufügen. Zum Beispiel kann man einen Tokenizer zu einem Modell-Repository hinzufügen: - -```py ->>> tokenizer.push_to_hub("my-awesome-model") -``` - -Oder vielleicht möchten Sie die TensorFlow-Version Ihres fein abgestimmten PyTorch-Modells hinzufügen: - -```py ->>> tf_model.push_to_hub("my-awesome-model") -``` - -Wenn Sie nun zu Ihrem Hugging Face-Profil navigieren, sollten Sie Ihr neu erstelltes Modell-Repository sehen. Wenn Sie auf die Registerkarte **Dateien** klicken, werden alle Dateien angezeigt, die Sie in das Repository hochgeladen haben. - -Weitere Einzelheiten zum Erstellen und Hochladen von Dateien in ein Repository finden Sie in der Hub-Dokumentation [hier](https://huggingface.co/docs/hub/how-to-upstream). - -## Hochladen mit der Weboberfläche - -Benutzer, die einen no-code Ansatz bevorzugen, können ein Modell über das Webinterface des Hubs hochladen. Besuchen Sie [huggingface.co/new](https://huggingface.co/new) um ein neues Repository zu erstellen: - -![new_model_repo](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/new_model_repo.png) - -Fügen Sie von hier aus einige Informationen über Ihr Modell hinzu: - -- Wählen Sie den **Besitzer** des Repositorys. Dies können Sie selbst oder eine der Organisationen sein, denen Sie angehören. -- Wählen Sie einen Namen für Ihr Modell, der auch der Name des Repositorys sein wird. -- Wählen Sie, ob Ihr Modell öffentlich oder privat ist. -- Geben Sie die Lizenzverwendung für Ihr Modell an. - -Klicken Sie nun auf die Registerkarte **Dateien** und klicken Sie auf die Schaltfläche **Datei hinzufügen**, um eine neue Datei in Ihr Repository hochzuladen. Ziehen Sie dann eine Datei per Drag-and-Drop hoch und fügen Sie eine Übergabemeldung hinzu. - -![upload_file](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/upload_file.png) - -## Hinzufügen einer Modellkarte - -Um sicherzustellen, dass die Benutzer die Fähigkeiten, Grenzen, möglichen Verzerrungen und ethischen Aspekte Ihres Modells verstehen, fügen Sie bitte eine Modellkarte zu Ihrem Repository hinzu. 
Die Modellkarte wird in der Datei `README.md` definiert. Sie können eine Modellkarte hinzufügen, indem Sie: - -* Manuelles Erstellen und Hochladen einer "README.md"-Datei. -* Klicken Sie auf die Schaltfläche **Modellkarte bearbeiten** in Ihrem Modell-Repository. - -Werfen Sie einen Blick auf die DistilBert [model card](https://huggingface.co/distilbert-base-uncased) als gutes Beispiel für die Art von Informationen, die eine Modellkarte enthalten sollte. Weitere Details über andere Optionen, die Sie in der Datei "README.md" einstellen können, wie z.B. den Kohlenstoff-Fußabdruck eines Modells oder Beispiele für Widgets, finden Sie in der Dokumentation [hier](https://huggingface.co/docs/hub/models-cards). \ No newline at end of file diff --git a/docs/source/de/peft.md b/docs/source/de/peft.md new file mode 100644 index 000000000000..bdc0684d798d --- /dev/null +++ b/docs/source/de/peft.md @@ -0,0 +1,216 @@ + + +# Adapter mit 🤗 PEFT laden + +[[open-in-colab]] + +Die [Parameter-Efficient Fine Tuning (PEFT)](https://huggingface.co/blog/peft) Methoden frieren die vorab trainierten Modellparameter während der Feinabstimmung ein und fügen eine kleine Anzahl trainierbarer Parameter (die Adapter) hinzu. Die Adapter werden trainiert, um aufgabenspezifische Informationen zu lernen. Es hat sich gezeigt, dass dieser Ansatz sehr speichereffizient ist und weniger Rechenleistung beansprucht, während die Ergebnisse mit denen eines vollständig feinabgestimmten Modells vergleichbar sind. + +Adapter, die mit PEFT trainiert wurden, sind in der Regel um eine Größenordnung kleiner als das vollständige Modell, so dass sie bequem gemeinsam genutzt, gespeichert und geladen werden können. + +
+ +
Die Adaptergewichte für ein OPTForCausalLM-Modell, die auf dem Hub gespeichert sind, sind nur ~6MB groß, verglichen mit der vollen Größe der Modellgewichte, die ~700MB betragen können.
+
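+Zur Veranschaulichung eine kleine Skizze (keine Vorgabe dieses Guides, sondern nur ein Beispiel mit angenommenen Werten wie dem Basismodell `facebook/opt-350m` und `r=8`), die mit den 🤗 PEFT-Funktionen `get_peft_model` und `print_trainable_parameters` zeigt, wie klein der trainierbare Anteil eines LoRA-Adapters im Vergleich zum Basismodell ist:
+
+```py
+from transformers import AutoModelForCausalLM
+from peft import LoraConfig, get_peft_model
+
+# Basismodell laden und eine kleine, beispielhafte LoRA-Konfiguration definieren
+base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
+lora_config = LoraConfig(task_type="CAUSAL_LM", r=8)
+
+# Adapter anhängen und den Anteil der trainierbaren Parameter ausgeben
+peft_model = get_peft_model(base_model, lora_config)
+peft_model.print_trainable_parameters()
+# gibt etwas aus wie: "trainable params: ... || all params: ... || trainable%: ..." (deutlich unter 1 %)
+```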
+ +Wenn Sie mehr über die 🤗 PEFT-Bibliothek erfahren möchten, sehen Sie sich die [Dokumentation](https://huggingface.co/docs/peft/index) an. + +## Setup + +Starten Sie mit der Installation von 🤗 PEFT: + +```bash +pip install peft +``` + +Wenn Sie die brandneuen Funktionen ausprobieren möchten, sollten Sie die Bibliothek aus dem Quellcode installieren: + +```bash +pip install git+https://github.com/huggingface/peft.git +``` + +## Unterstützte PEFT-Modelle + +Transformers unterstützt nativ einige PEFT-Methoden, d.h. Sie können lokal oder auf dem Hub gespeicherte Adaptergewichte laden und sie mit wenigen Zeilen Code einfach ausführen oder trainieren. Die folgenden Methoden werden unterstützt: + +- [Low Rank Adapters](https://huggingface.co/docs/peft/conceptual_guides/lora) +- [IA3](https://huggingface.co/docs/peft/conceptual_guides/ia3) +- [AdaLoRA](https://arxiv.org/abs/2303.10512) + +Wenn Sie andere PEFT-Methoden, wie z.B. Prompt Learning oder Prompt Tuning, verwenden möchten, oder über die 🤗 PEFT-Bibliothek im Allgemeinen, lesen Sie bitte die [Dokumentation](https://huggingface.co/docs/peft/index). + + +## Laden Sie einen PEFT-Adapter + +Um ein PEFT-Adaptermodell von 🤗 Transformers zu laden und zu verwenden, stellen Sie sicher, dass das Hub-Repository oder das lokale Verzeichnis eine `adapter_config.json`-Datei und die Adaptergewichte enthält, wie im obigen Beispielbild gezeigt. Dann können Sie das PEFT-Adaptermodell mit der Klasse `AutoModelFor` laden. Um zum Beispiel ein PEFT-Adaptermodell für die kausale Sprachmodellierung zu laden: + +1. Geben Sie die PEFT-Modell-ID an. +2. übergeben Sie es an die Klasse [`AutoModelForCausalLM`]. + +```py +from transformers import AutoModelForCausalLM, AutoTokenizer + +peft_model_id = "ybelkada/opt-350m-lora" +model = AutoModelForCausalLM.from_pretrained(peft_model_id) +``` + + + +Sie können einen PEFT-Adapter entweder mit einer `AutoModelFor`-Klasse oder der Basismodellklasse wie `OPTForCausalLM` oder `LlamaForCausalLM` laden. + + + +Sie können einen PEFT-Adapter auch laden, indem Sie die Methode `load_adapter` aufrufen: + +```py +from transformers import AutoModelForCausalLM, AutoTokenizer + +model_id = "facebook/opt-350m" +peft_model_id = "ybelkada/opt-350m-lora" + +model = AutoModelForCausalLM.from_pretrained(model_id) +model.load_adapter(peft_model_id) +``` + +## Laden in 8bit oder 4bit + +Die `bitsandbytes`-Integration unterstützt Datentypen mit 8bit und 4bit Genauigkeit, was für das Laden großer Modelle nützlich ist, weil es Speicher spart (lesen Sie den `bitsandbytes`-Integrations [guide](./quantization#bitsandbytes-integration), um mehr zu erfahren). Fügen Sie die Parameter `load_in_8bit` oder `load_in_4bit` zu [`~PreTrainedModel.from_pretrained`] hinzu und setzen Sie `device_map="auto"`, um das Modell effektiv auf Ihre Hardware zu verteilen: + +```py +from transformers import AutoModelForCausalLM, AutoTokenizer + +peft_model_id = "ybelkada/opt-350m-lora" +model = AutoModelForCausalLM.from_pretrained(peft_model_id, device_map="auto", load_in_8bit=True) +``` + +## Einen neuen Adapter hinzufügen + +Sie können [`~peft.PeftModel.add_adapter`] verwenden, um einen neuen Adapter zu einem Modell mit einem bestehenden Adapter hinzuzufügen, solange der neue Adapter vom gleichen Typ ist wie der aktuelle Adapter. 
Wenn Sie zum Beispiel einen bestehenden LoRA-Adapter an ein Modell angehängt haben:
+
+```py
+from transformers import AutoModelForCausalLM, OPTForCausalLM, AutoTokenizer
+from peft import LoraConfig
+
+model_id = "facebook/opt-350m"
+model = AutoModelForCausalLM.from_pretrained(model_id)
+
+lora_config = LoraConfig(
+    target_modules=["q_proj", "k_proj"],
+    init_lora_weights=False
+)
+
+model.add_adapter(lora_config, adapter_name="adapter_1")
+```
+
+Um einen neuen Adapter hinzuzufügen:
+
+```py
+# attach new adapter with same config
+model.add_adapter(lora_config, adapter_name="adapter_2")
+```
+
+Jetzt können Sie mit [`~peft.PeftModel.set_adapter`] festlegen, welcher Adapter verwendet werden soll:
+
+```py
+# use adapter_1
+model.set_adapter("adapter_1")
+output = model.generate(**inputs)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+
+# use adapter_2
+model.set_adapter("adapter_2")
+output_enabled = model.generate(**inputs)
+print(tokenizer.decode(output_enabled[0], skip_special_tokens=True))
+```
+
+## Aktivieren und Deaktivieren von Adaptern
+
+Sobald Sie einen Adapter zu einem Modell hinzugefügt haben, können Sie das Adaptermodul aktivieren oder deaktivieren. So aktivieren Sie das Adaptermodul:
+
+```py
+from transformers import AutoModelForCausalLM, OPTForCausalLM, AutoTokenizer
+from peft import PeftConfig
+
+model_id = "facebook/opt-350m"
+adapter_model_id = "ybelkada/opt-350m-lora"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+text = "Hello"
+inputs = tokenizer(text, return_tensors="pt")
+
+model = AutoModelForCausalLM.from_pretrained(model_id)
+peft_config = PeftConfig.from_pretrained(adapter_model_id)
+
+# to initiate with random weights
+peft_config.init_lora_weights = False
+
+model.add_adapter(peft_config)
+model.enable_adapters()
+output = model.generate(**inputs)
+```
+
+So deaktivieren Sie das Adaptermodul:
+
+```py
+model.disable_adapters()
+output = model.generate(**inputs)
+```
+
+## PEFT-Adapter trainieren
+
+PEFT-Adapter werden von der Klasse [`Trainer`] unterstützt, so dass Sie einen Adapter für Ihren speziellen Anwendungsfall trainieren können. Dazu müssen Sie nur ein paar weitere Codezeilen hinzufügen. Zum Beispiel, um einen LoRA-Adapter zu trainieren:
+
+<Tip>
+
+Wenn Sie mit der Feinabstimmung eines Modells mit [`Trainer`] noch nicht vertraut sind, werfen Sie einen Blick auf das Tutorial [Feinabstimmung eines vortrainierten Modells](training).
+
+</Tip>
+
+1. Definieren Sie Ihre Adapterkonfiguration mit dem Aufgabentyp und den Hyperparametern (siehe [`~peft.LoraConfig`] für weitere Details darüber, was die Hyperparameter tun).
+
+```py
+from peft import LoraConfig
+
+peft_config = LoraConfig(
+    lora_alpha=16,
+    lora_dropout=0.1,
+    r=64,
+    bias="none",
+    task_type="CAUSAL_LM",
+)
+```
+
+2. Fügen Sie dem Modell einen Adapter hinzu.
+
+```py
+model.add_adapter(peft_config)
+```
+
+3. Jetzt können Sie das Modell an [`Trainer`] übergeben!
+
+```py
+trainer = Trainer(model=model, ...)
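+# `...` steht hier als Platzhalter für die übrigen Trainer-Argumente (z. B. args und train_dataset)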
+trainer.train() +``` + +So speichern Sie Ihren trainierten Adapter und laden ihn wieder: + +```py +model.save_pretrained(save_dir) +model = AutoModelForCausalLM.from_pretrained(save_dir) +``` + + diff --git a/docs/source/de/pipeline_tutorial.md b/docs/source/de/pipeline_tutorial.md new file mode 100644 index 000000000000..06ab440d73a6 --- /dev/null +++ b/docs/source/de/pipeline_tutorial.md @@ -0,0 +1,175 @@ + + +# Pipelines für Inferenzen + +Die [`pipeline`] macht es einfach, jedes beliebige Modell aus dem [Hub](https://huggingface.co/models) für die Inferenz auf jede Sprache, Computer Vision, Sprache und multimodale Aufgaben zu verwenden. Selbst wenn Sie keine Erfahrung mit einer bestimmten Modalität haben oder nicht mit dem zugrundeliegenden Code hinter den Modellen vertraut sind, können Sie sie mit der [`pipeline`] für Inferenzen verwenden! In diesem Beispiel lernen Sie, wie: + +* Eine [`pipeline`] für Inferenz zu verwenden. +* Einen bestimmten Tokenizer oder ein bestimmtes Modell zu verwenden. +* Eine [`pipeline`] für Audio-, Vision- und multimodale Aufgaben zu verwenden. + + + +Eine vollständige Liste der unterstützten Aufgaben und verfügbaren Parameter finden Sie in der [`pipeline`]-Dokumentation. + + + +## Verwendung von Pipelines + +Obwohl jede Aufgabe eine zugehörige [`pipeline`] hat, ist es einfacher, die allgemeine [`pipeline`]-Abstraktion zu verwenden, die alle aufgabenspezifischen Pipelines enthält. Die [`pipeline`] lädt automatisch ein Standardmodell und eine Vorverarbeitungsklasse, die für Ihre Aufgabe inferenzfähig ist. + +1. Beginnen Sie mit der Erstellung einer [`pipeline`] und geben Sie eine Inferenzaufgabe an: + +```py +>>> from transformers import pipeline + +>>> generator = pipeline(task="text-generation") +``` + +2. Übergeben Sie Ihren Eingabetext an die [`pipeline`]: + +```py +>>> generator( +... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone" +... ) # doctest: +SKIP +[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Iron-priests at the door to the east, and thirteen for the Lord Kings at the end of the mountain'}] +``` + +Wenn Sie mehr als eine Eingabe haben, übergeben Sie die Eingabe als Liste: + +```py +>>> generator( +... [ +... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", +... "Nine for Mortal Men, doomed to die, One for the Dark Lord on his dark throne", +... ] +... ) # doctest: +SKIP +``` + +Alle zusätzlichen Parameter für Ihre Aufgabe können auch in die [`pipeline`] aufgenommen werden. Die Aufgabe `Text-Generierung` hat eine [`~generation.GenerationMixin.generate`]-Methode mit mehreren Parametern zur Steuerung der Ausgabe. Wenn Sie zum Beispiel mehr als eine Ausgabe erzeugen wollen, setzen Sie den Parameter `num_return_sequences`: + +```py +>>> generator( +... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", +... num_return_sequences=2, +... ) # doctest: +SKIP +``` + +### Wählen Sie ein Modell und einen Tokenizer + +Die [`pipeline`] akzeptiert jedes Modell aus dem [Hub] (https://huggingface.co/models). Auf dem Hub gibt es Tags, mit denen Sie nach einem Modell filtern können, das Sie für Ihre Aufgabe verwenden möchten. Sobald Sie ein passendes Modell ausgewählt haben, laden Sie es mit der entsprechenden `AutoModelFor` und [`AutoTokenizer`] Klasse. 
Laden Sie zum Beispiel die Klasse [`AutoModelForCausalLM`] für eine kausale Sprachmodellierungsaufgabe: + +```py +>>> from transformers import AutoTokenizer, AutoModelForCausalLM + +>>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") +>>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") +``` + +Erstellen Sie eine [`pipeline`] für Ihre Aufgabe, und geben Sie das Modell und den Tokenizer an, die Sie geladen haben: + +```py +>>> from transformers import pipeline + +>>> generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer) +``` + +Übergeben Sie Ihren Eingabetext an die [`pipeline`] , um einen Text zu erzeugen: + +```py +>>> generator( +... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone" +... ) # doctest: +SKIP +[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Dragon-lords (for them to rule in a world ruled by their rulers, and all who live within the realm'}] +``` + +## Audio-Pipeline + +Die [`pipeline`] unterstützt auch Audioaufgaben wie Audioklassifizierung und automatische Spracherkennung. + +Lassen Sie uns zum Beispiel die Emotion in diesem Audioclip klassifizieren: + +```py +>>> from datasets import load_dataset +>>> import torch + +>>> torch.manual_seed(42) # doctest: +IGNORE_RESULT +>>> ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") +>>> audio_file = ds[0]["audio"]["path"] +``` + +Finden Sie ein [Audioklassifikation](https://huggingface.co/models?pipeline_tag=audio-classification) Modell auf dem Model Hub für Emotionserkennung und laden Sie es in die [`pipeline`]: + +```py +>>> from transformers import pipeline + +>>> audio_classifier = pipeline( +... task="audio-classification", model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" +... ) +``` + +Übergeben Sie die Audiodatei an die [`pipeline`]: + +```py +>>> preds = audio_classifier(audio_file) +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> preds +[{'score': 0.1315, 'label': 'calm'}, {'score': 0.1307, 'label': 'neutral'}, {'score': 0.1274, 'label': 'sad'}, {'score': 0.1261, 'label': 'fearful'}, {'score': 0.1242, 'label': 'happy'}] +``` + +## Bildverarbeitungs-Pipeline + +Die Verwendung einer [`pipeline`] für Bildverarbeitungsaufgaben ist praktisch identisch. + +Geben Sie Ihre Aufgabe an und übergeben Sie Ihr Bild an den Klassifikator. Das Bild kann ein Link oder ein lokaler Pfad zu dem Bild sein. Zum Beispiel: Welche Katzenart ist unten abgebildet? + +![pipeline-cat-chonk](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg) + +```py +>>> from transformers import pipeline + +>>> vision_classifier = pipeline(task="image-classification") +>>> preds = vision_classifier( +... images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... ) +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> preds +[{'score': 0.4335, 'label': 'lynx, catamount'}, {'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}, {'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}, {'score': 0.0239, 'label': 'Egyptian cat'}, {'score': 0.0229, 'label': 'tiger cat'}] +``` + +## Multimodale Pipeline + +Die [`pipeline`] unterstützt mehr als eine Modalität. 
Eine Aufgabe zur Beantwortung visueller Fragen (VQA) kombiniert zum Beispiel Text und Bild. Verwenden Sie einen beliebigen Bildlink und eine Frage, die Sie zu dem Bild stellen möchten. Das Bild kann eine URL oder ein lokaler Pfad zu dem Bild sein. + +Wenn Sie zum Beispiel das gleiche Bild wie in der obigen Vision-Pipeline verwenden: + +```py +>>> image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +>>> question = "Where is the cat?" +``` + +Erstellen Sie eine Pipeline für "vqa" und übergeben Sie ihr das Bild und die Frage: + +```py +>>> from transformers import pipeline + +>>> vqa = pipeline(task="vqa") +>>> preds = vqa(image=image, question=question) +>>> preds = [{"score": round(pred["score"], 4), "answer": pred["answer"]} for pred in preds] +>>> preds +[{'score': 0.9112, 'answer': 'snow'}, {'score': 0.8796, 'answer': 'in snow'}, {'score': 0.6717, 'answer': 'outside'}, {'score': 0.0291, 'answer': 'on ground'}, {'score': 0.027, 'answer': 'ground'}] +``` diff --git a/docs/source/de/pipeline_tutorial.mdx b/docs/source/de/pipeline_tutorial.mdx deleted file mode 100644 index 19c37c35dea1..000000000000 --- a/docs/source/de/pipeline_tutorial.mdx +++ /dev/null @@ -1,171 +0,0 @@ - - -# Pipelines für Inferenzen - -Die [`pipeline`] macht es einfach, jedes beliebige Modell aus dem [Hub](https://huggingface.co/models) für die Inferenz auf jede Sprache, Computer Vision, Sprache und multimodale Aufgaben zu verwenden. Selbst wenn Sie keine Erfahrung mit einer bestimmten Modalität haben oder nicht mit dem zugrundeliegenden Code hinter den Modellen vertraut sind, können Sie sie mit der [`pipeline`] für Inferenzen verwenden! In diesem Beispiel lernen Sie, wie: - -* Eine [`pipeline`] für Inferenz zu verwenden. -* Einen bestimmten Tokenizer oder ein bestimmtes Modell zu verwenden. -* Eine [`pipeline`] für Audio-, Vision- und multimodale Aufgaben zu verwenden. - - - -Eine vollständige Liste der unterstützten Aufgaben und verfügbaren Parameter finden Sie in der [`pipeline`]-Dokumentation. - - - -## Verwendung von Pipelines - -Obwohl jede Aufgabe eine zugehörige [`pipeline`] hat, ist es einfacher, die allgemeine [`pipeline`]-Abstraktion zu verwenden, die alle aufgabenspezifischen Pipelines enthält. Die [`pipeline`] lädt automatisch ein Standardmodell und eine Vorverarbeitungsklasse, die für Ihre Aufgabe inferenzfähig ist. - -1. Beginnen Sie mit der Erstellung einer [`pipeline`] und geben Sie eine Inferenzaufgabe an: - -```py ->>> from transformers import pipeline - ->>> generator = pipeline(task="text-generation") -``` - -2. Übergeben Sie Ihren Eingabetext an die [`pipeline`]: - -```py ->>> generator( -... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone" -... ) # doctest: +SKIP -[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Iron-priests at the door to the east, and thirteen for the Lord Kings at the end of the mountain'}] -``` - -Wenn Sie mehr als eine Eingabe haben, übergeben Sie die Eingabe als Liste: - -```py ->>> generator( -... [ -... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", -... "Nine for Mortal Men, doomed to die, One for the Dark Lord on his dark throne", -... ] -... ) # doctest: +SKIP -``` - -Alle zusätzlichen Parameter für Ihre Aufgabe können auch in die [`pipeline`] aufgenommen werden. 
Die Aufgabe `Text-Generierung` hat eine [`~generation.GenerationMixin.generate`]-Methode mit mehreren Parametern zur Steuerung der Ausgabe. Wenn Sie zum Beispiel mehr als eine Ausgabe erzeugen wollen, setzen Sie den Parameter `num_return_sequences`: - -```py ->>> generator( -... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", -... num_return_sequences=2, -... ) # doctest: +SKIP -``` - -### Wählen Sie ein Modell und einen Tokenizer - -Die [`pipeline`] akzeptiert jedes Modell aus dem [Hub] (https://huggingface.co/models). Auf dem Hub gibt es Tags, mit denen Sie nach einem Modell filtern können, das Sie für Ihre Aufgabe verwenden möchten. Sobald Sie ein passendes Modell ausgewählt haben, laden Sie es mit der entsprechenden `AutoModelFor` und [`AutoTokenizer`] Klasse. Laden Sie zum Beispiel die Klasse [`AutoModelForCausalLM`] für eine kausale Sprachmodellierungsaufgabe: - -```py ->>> from transformers import AutoTokenizer, AutoModelForCausalLM - ->>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") ->>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") -``` - -Erstellen Sie eine [`pipeline`] für Ihre Aufgabe, und geben Sie das Modell und den Tokenizer an, die Sie geladen haben: - -```py ->>> from transformers import pipeline - ->>> generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer) -``` - -Übergeben Sie Ihren Eingabetext an die [`pipeline`] , um einen Text zu erzeugen: - -```py ->>> generator( -... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone" -... ) # doctest: +SKIP -[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Dragon-lords (for them to rule in a world ruled by their rulers, and all who live within the realm'}] -``` - -## Audio-Pipeline - -Die [`pipeline`] unterstützt auch Audioaufgaben wie Audioklassifizierung und automatische Spracherkennung. - -Lassen Sie uns zum Beispiel die Emotion in diesem Audioclip klassifizieren: - -```py ->>> from datasets import load_dataset ->>> import torch - ->>> torch.manual_seed(42) # doctest: +IGNORE_RESULT ->>> ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") ->>> audio_file = ds[0]["audio"]["path"] -``` - -Finden Sie ein [Audioklassifikation](https://huggingface.co/models?pipeline_tag=audio-classification) Modell auf dem Model Hub für Emotionserkennung und laden Sie es in die [`pipeline`]: - -```py ->>> from transformers import pipeline - ->>> audio_classifier = pipeline( -... task="audio-classification", model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" -... ) -``` - -Übergeben Sie die Audiodatei an die [`pipeline`]: - -```py ->>> preds = audio_classifier(audio_file) ->>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] ->>> preds -[{'score': 0.1315, 'label': 'calm'}, {'score': 0.1307, 'label': 'neutral'}, {'score': 0.1274, 'label': 'sad'}, {'score': 0.1261, 'label': 'fearful'}, {'score': 0.1242, 'label': 'happy'}] -``` - -## Bildverarbeitungs-Pipeline - -Die Verwendung einer [`pipeline`] für Bildverarbeitungsaufgaben ist praktisch identisch. - -Geben Sie Ihre Aufgabe an und übergeben Sie Ihr Bild an den Klassifikator. Das Bild kann ein Link oder ein lokaler Pfad zu dem Bild sein. Zum Beispiel: Welche Katzenart ist unten abgebildet? 
- -![pipeline-cat-chonk](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg) - -```py ->>> from transformers import pipeline - ->>> vision_classifier = pipeline(task="image-classification") ->>> preds = vision_classifier( -... images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" -... ) ->>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] ->>> preds -[{'score': 0.4335, 'label': 'lynx, catamount'}, {'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}, {'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}, {'score': 0.0239, 'label': 'Egyptian cat'}, {'score': 0.0229, 'label': 'tiger cat'}] -``` - -## Multimodale Pipeline - -Die [`pipeline`] unterstützt mehr als eine Modalität. Eine Aufgabe zur Beantwortung visueller Fragen (VQA) kombiniert zum Beispiel Text und Bild. Verwenden Sie einen beliebigen Bildlink und eine Frage, die Sie zu dem Bild stellen möchten. Das Bild kann eine URL oder ein lokaler Pfad zu dem Bild sein. - -Wenn Sie zum Beispiel das gleiche Bild wie in der obigen Vision-Pipeline verwenden: - -```py ->>> image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" ->>> question = "Where is the cat?" -``` - -Erstellen Sie eine Pipeline für "vqa" und übergeben Sie ihr das Bild und die Frage: - -```py ->>> from transformers import pipeline - ->>> vqa = pipeline(task="vqa") ->>> preds = vqa(image=image, question=question) ->>> preds = [{"score": round(pred["score"], 4), "answer": pred["answer"]} for pred in preds] ->>> preds -[{'score': 0.9112, 'answer': 'snow'}, {'score': 0.8796, 'answer': 'in snow'}, {'score': 0.6717, 'answer': 'outside'}, {'score': 0.0291, 'answer': 'on ground'}, {'score': 0.027, 'answer': 'ground'}] -``` diff --git a/docs/source/de/pr_checks.md b/docs/source/de/pr_checks.md new file mode 100644 index 000000000000..ee2bbf489b8e --- /dev/null +++ b/docs/source/de/pr_checks.md @@ -0,0 +1,199 @@ + + +# Überprüfungen bei einer Pull-Anfrage + +Wenn Sie eine Pull-Anfrage für 🤗 Transformers öffnen, wird eine ganze Reihe von Prüfungen durchgeführt, um sicherzustellen, dass der Patch, den Sie hinzufügen, nichts Bestehendes zerstört. Es gibt vier Arten von Prüfungen: +- reguläre Tests +- Erstellung der Dokumentation +- Stil von Code und Dokumentation +- allgemeine Konsistenz des Repository + +In diesem Dokument werden wir versuchen zu erklären, worum es sich bei diesen verschiedenen Prüfungen handelt und wie Sie sie lokal debuggen können, wenn eine der Prüfungen in Ihrer PR fehlschlägt. + +Beachten Sie, dass Sie im Idealfall eine Dev-Installation benötigen: + +```bash +pip install transformers[dev] +``` + +oder für eine bearbeitbare Installation: + +```bash +pip install -e .[dev] +``` + +innerhalb des Transformers Repo. Da die Anzahl der optionalen Abhängigkeiten von Transformers stark zugenommen hat, ist es möglich, dass Sie nicht alle davon bekommen können. Wenn die Dev-Installation fehlschlägt, stellen Sie sicher, dass Sie das Deep Learning-Framework, mit dem Sie arbeiten, installieren (PyTorch, TensorFlow und/oder Flax). + +```bash +pip install transformers[quality] +``` + +oder für eine bearbeitbare Installation: + +```bash +pip install -e .[quality] +``` + + +## Tests + +Alle Jobs, die mit `ci/circleci: run_tests_` beginnen, führen Teile der Transformers-Testsuite aus. 
Jeder dieser Jobs konzentriert sich auf einen Teil der Bibliothek in einer bestimmten Umgebung: `ci/circleci: run_tests_pipelines_tf` zum Beispiel führt den Pipelines-Test in einer Umgebung aus, in der nur TensorFlow installiert ist. + +Beachten Sie, dass nur ein Teil der Testsuite jedes Mal ausgeführt wird, um zu vermeiden, dass Tests ausgeführt werden, wenn es keine wirkliche Änderung in den Modulen gibt, die sie testen: ein Dienstprogramm wird ausgeführt, um die Unterschiede in der Bibliothek zwischen vor und nach dem PR zu ermitteln (was GitHub Ihnen auf der Registerkarte "Files changes" anzeigt) und die Tests auszuwählen, die von diesem Unterschied betroffen sind. Dieses Dienstprogramm kann lokal mit ausgeführt werden: + +```bash +python utils/tests_fetcher.py +``` + +aus dem Stammverzeichnis des Transformers-Repositoriums. Es wird: + +1. Überprüfen Sie für jede Datei im Diff, ob die Änderungen im Code oder nur in Kommentaren oder Docstrings enthalten sind. Nur die Dateien mit echten Codeänderungen werden beibehalten. +2. Erstellen Sie eine interne Map, die für jede Datei des Quellcodes der Bibliothek alle Dateien angibt, auf die sie rekursiv Einfluss nimmt. Von Modul A wird gesagt, dass es sich auf Modul B auswirkt, wenn Modul B Modul A importiert. Für die rekursive Auswirkung benötigen wir eine Kette von Modulen, die von Modul A zu Modul B führt und in der jedes Modul das vorherige importiert. +3. Wenden Sie diese Zuordnung auf die in Schritt 1 gesammelten Dateien an. So erhalten wir die Liste der Modelldateien, die von der PR betroffen sind. +4. Ordnen Sie jede dieser Dateien der/den entsprechenden Testdatei(en) zu und erhalten Sie die Liste der auszuführenden Tests. + +Wenn Sie das Skript lokal ausführen, sollten Sie die Ergebnisse von Schritt 1, 3 und 4 ausgegeben bekommen und somit wissen, welche Tests ausgeführt werden. Das Skript erstellt außerdem eine Datei namens `test_list.txt`, die die Liste der auszuführenden Tests enthält, die Sie mit dem folgenden Befehl lokal ausführen können: + +```bash +python -m pytest -n 8 --dist=loadfile -rA -s $(cat test_list.txt) +``` + +Für den Fall, dass Ihnen etwas entgangen ist, wird die komplette Testreihe ebenfalls täglich ausgeführt. + +## Dokumentation erstellen + +Der Job `build_pr_documentation` erstellt und generiert eine Vorschau der Dokumentation, um sicherzustellen, dass alles in Ordnung ist, wenn Ihr PR zusammengeführt wird. Ein Bot fügt einen Link zur Vorschau der Dokumentation zu Ihrem PR hinzu. Alle Änderungen, die Sie an dem PR vornehmen, werden automatisch in der Vorschau aktualisiert. Wenn die Dokumentation nicht erstellt werden kann, klicken Sie auf **Details** neben dem fehlgeschlagenen Auftrag, um zu sehen, wo der Fehler liegt. Oft ist der Fehler so einfach wie eine fehlende Datei im `toctree`. + +Wenn Sie daran interessiert sind, die Dokumentation lokal zu erstellen oder in der Vorschau anzusehen, werfen Sie einen Blick in die [`README.md`](https://github.com/huggingface/transformers/tree/main/docs) im Ordner docs. + +## Code und Dokumentationsstil + +Die Formatierung des Codes erfolgt für alle Quelldateien, die Beispiele und die Tests mit `black` und `ruff`. Wir haben auch ein benutzerdefiniertes Tool, das sich um die Formatierung von docstrings und `rst`-Dateien kümmert (`utils/style_doc.py`), sowie um die Reihenfolge der Lazy-Importe, die in den Transformers `__init__.py`-Dateien durchgeführt werden (`utils/custom_init_isort.py`). 
All dies können Sie starten, indem Sie Folgendes ausführen + +```bash +make style +``` + +Das CI prüft, ob diese innerhalb der Prüfung `ci/circleci: check_code_quality` angewendet wurden. Es führt auch `ruff` aus, das einen grundlegenden Blick auf Ihren Code wirft und sich beschwert, wenn es eine undefinierte Variable findet oder eine, die nicht verwendet wird. Um diese Prüfung lokal auszuführen, verwenden Sie + +```bash +make quality +``` + +Dies kann sehr viel Zeit in Anspruch nehmen. Um dasselbe nur für die Dateien zu tun, die Sie im aktuellen Zweig geändert haben, führen Sie + +```bash +make fixup +``` + +Dieser letzte Befehl führt auch alle zusätzlichen Prüfungen für die Konsistenz des Repositorys durch. Schauen wir uns diese an. + +## Repository-Konsistenz + +Dies fasst alle Tests zusammen, die sicherstellen, dass Ihr PR das Repository in einem guten Zustand verlässt. Sie können diese Prüfung lokal durchführen, indem Sie Folgendes ausführen: + +```bash +make repo-consistency +``` + +Dies überprüft, ob: + +- Alle zum Init hinzugefügten Objekte sind dokumentiert (ausgeführt von `utils/check_repo.py`) +- Alle `__init__.py`-Dateien haben in ihren beiden Abschnitten den gleichen Inhalt (ausgeführt von `utils/check_inits.py`) +- Der gesamte Code, der als Kopie eines anderen Moduls identifiziert wurde, stimmt mit dem Original überein (ausgeführt von `utils/check_copies.py`) +- Alle Konfigurationsklassen haben mindestens einen gültigen Prüfpunkt, der in ihren Dokumentationen erwähnt wird (ausgeführt von `utils/check_config_docstrings.py`) +- Alle Konfigurationsklassen enthalten nur Attribute, die in den entsprechenden Modellierungsdateien verwendet werden (ausgeführt von `utils/check_config_attributes.py`) +- Die Übersetzungen der READMEs und der Index des Dokuments haben die gleiche Modellliste wie die Haupt-README (durchgeführt von `utils/check_copies.py`) +- Die automatisch generierten Tabellen in der Dokumentation sind auf dem neuesten Stand (ausgeführt von `utils/check_table.py`) +- Die Bibliothek verfügt über alle Objekte, auch wenn nicht alle optionalen Abhängigkeiten installiert sind (ausgeführt von `utils/check_dummies.py`) + +Sollte diese Prüfung fehlschlagen, müssen die ersten beiden Punkte manuell korrigiert werden, die letzten vier können automatisch für Sie korrigiert werden, indem Sie den Befehl + +```bash +make fix-copies +``` + +Zusätzliche Prüfungen betreffen PRs, die neue Modelle hinzufügen, vor allem, dass: + +- Alle hinzugefügten Modelle befinden sich in einer Auto-Zuordnung (durchgeführt von `utils/check_repo.py`) + +- Alle Modelle werden ordnungsgemäß getestet (ausgeführt von `utils/check_repo.py`) + + + +### Kopien prüfen + +Da die Transformers-Bibliothek in Bezug auf den Modellcode sehr eigenwillig ist und jedes Modell vollständig in einer einzigen Datei implementiert sein sollte, ohne sich auf andere Modelle zu stützen, haben wir einen Mechanismus hinzugefügt, der überprüft, ob eine Kopie des Codes einer Ebene eines bestimmten Modells mit dem Original übereinstimmt. Auf diese Weise können wir bei einer Fehlerbehebung alle anderen betroffenen Modelle sehen und entscheiden, ob wir die Änderung weitergeben oder die Kopie zerstören. + + + +Wenn eine Datei eine vollständige Kopie einer anderen Datei ist, sollten Sie sie in der Konstante `FULL_COPIES` von `utils/check_copies.py` registrieren. + + + +Dieser Mechanismus stützt sich auf Kommentare der Form `# Kopiert von xxx`. Das `xxx` sollte den gesamten Pfad zu der Klasse der Funktion enthalten, die darunter kopiert wird. 
Zum Beispiel ist `RobertaSelfOutput` eine direkte Kopie der Klasse `BertSelfOutput`. Sie können also [hier](https://github.com/huggingface/transformers/blob/2bd7a27a671fd1d98059124024f580f8f5c0f3b5/src/transformers/models/roberta/modeling_roberta.py#L289) sehen, dass sie einen Kommentar hat: + +```py +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput +``` + +Beachten Sie, dass Sie dies nicht auf eine ganze Klasse anwenden, sondern auf die entsprechenden Methoden, von denen kopiert wird. Zum Beispiel [hier](https://github.com/huggingface/transformers/blob/2bd7a27a671fd1d98059124024f580f8f5c0f3b5/src/transformers/models/roberta/modeling_roberta.py#L598) können Sie sehen, wie `RobertaPreTrainedModel._init_weights` von der gleichen Methode in `BertPreTrainedModel` mit dem Kommentar kopiert wird: + +```py +# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights +``` + +Manchmal ist die Kopie bis auf die Namen genau gleich: zum Beispiel verwenden wir in `RobertaAttention` `RobertaSelfAttention` anstelle von `BertSelfAttention`, aber ansonsten ist der Code genau derselbe. Aus diesem Grund unterstützt `#Copied from` einfache String-Ersetzungen mit der folgenden Syntax: `Kopiert von xxx mit foo->bar`. Das bedeutet, dass der Code kopiert wird, wobei alle Instanzen von "foo" durch "bar" ersetzt werden. Sie können sehen, wie es [hier](https://github.com/huggingface/transformers/blob/2bd7a27a671fd1d98059124024f580f8f5c0f3b5/src/transformers/models/roberta/modeling_roberta.py#L304C1-L304C86) in `RobertaAttention` mit dem Kommentar verwendet wird: + +```py +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Roberta +``` + +Beachten Sie, dass um den Pfeil herum keine Leerzeichen stehen sollten (es sei denn, das Leerzeichen ist Teil des zu ersetzenden Musters, natürlich). + +Sie können mehrere Muster durch ein Komma getrennt hinzufügen. Zum Beispiel ist hier `CamemberForMaskedLM` eine direkte Kopie von `RobertaForMaskedLM` mit zwei Ersetzungen: `Roberta` zu `Camembert` und `ROBERTA` zu `CAMEMBERT`. Sie können [hier](https://github.com/huggingface/transformers/blob/15082a9dc6950ecae63a0d3e5060b2fc7f15050a/src/transformers/models/camembert/modeling_camembert.py#L929) sehen, wie dies mit dem Kommentar gemacht wird: + +```py +# Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->Camembert, ROBERTA->CAMEMBERT +``` + +Wenn die Reihenfolge eine Rolle spielt (weil eine der Ersetzungen mit einer vorherigen in Konflikt geraten könnte), werden die Ersetzungen von links nach rechts ausgeführt. + + + +Wenn die Ersetzungen die Formatierung ändern (wenn Sie z.B. einen kurzen Namen durch einen sehr langen Namen ersetzen), wird die Kopie nach Anwendung des automatischen Formats überprüft. + + + +Eine andere Möglichkeit, wenn es sich bei den Mustern nur um verschiedene Umschreibungen derselben Ersetzung handelt (mit einer groß- und einer kleingeschriebenen Variante), besteht darin, die Option `all-casing` hinzuzufügen. 
[Hier](https://github.com/huggingface/transformers/blob/15082a9dc6950ecae63a0d3e5060b2fc7f15050a/src/transformers/models/mobilebert/modeling_mobilebert.py#L1237) ist ein Beispiel in `MobileBertForSequenceClassification` mit dem Kommentar: + +```py +# Copied from transformers.models.bert.modeling_bert.BertForSequenceClassification with Bert->MobileBert all-casing +``` + +In diesem Fall wird der Code von `BertForSequenceClassification` kopiert, indem er ersetzt wird: +- `Bert` durch `MobileBert` (zum Beispiel bei der Verwendung von `MobileBertModel` in der Init) +- `bert` durch `mobilebert` (zum Beispiel bei der Definition von `self.mobilebert`) +- `BERT` durch `MOBILEBERT` (in der Konstante `MOBILEBERT_INPUTS_DOCSTRING`) diff --git a/docs/source/de/preprocessing.md b/docs/source/de/preprocessing.md new file mode 100644 index 000000000000..1e8f6ff4062a --- /dev/null +++ b/docs/source/de/preprocessing.md @@ -0,0 +1,506 @@ + + +# Vorverarbeiten + +[[open-in-colab]] + +Bevor Sie Ihre Daten in einem Modell verwenden können, müssen die Daten in ein für das Modell akzeptables Format gebracht werden. Ein Modell versteht keine Rohtexte, Bilder oder Audiodaten. Diese Eingaben müssen in Zahlen umgewandelt und zu Tensoren zusammengesetzt werden. In dieser Anleitung werden Sie: + +* Textdaten mit einem Tokenizer vorverarbeiten. +* Bild- oder Audiodaten mit einem Feature Extractor vorverarbeiten. +* Daten für eine multimodale Aufgabe mit einem Prozessor vorverarbeiten. + +## NLP + + + +Das wichtigste Werkzeug zur Verarbeitung von Textdaten ist ein [Tokenizer](main_classes/tokenizer). Ein Tokenizer zerlegt Text zunächst nach einer Reihe von Regeln in *Token*. Die Token werden in Zahlen umgewandelt, die zum Aufbau von Tensoren als Eingabe für ein Modell verwendet werden. Alle zusätzlichen Eingaben, die ein Modell benötigt, werden ebenfalls vom Tokenizer hinzugefügt. + + + +Wenn Sie ein vortrainiertes Modell verwenden möchten, ist es wichtig, den zugehörigen vortrainierten Tokenizer zu verwenden. Dadurch wird sichergestellt, dass der Text auf die gleiche Weise aufgeteilt wird wie das Pretraining-Korpus und die gleichen entsprechenden Token-zu-Index (in der Regel als *vocab* bezeichnet) während des Pretrainings verwendet werden. + + + +Laden Sie einen vortrainierten Tokenizer mit der Klasse [AutoTokenizer], um schnell loszulegen. Damit wird das *vocab* heruntergeladen, das verwendet wird, wenn ein Modell vortrainiert wird. + +### Tokenize + +Laden Sie einen vortrainierten Tokenizer mit [`AutoTokenizer.from_pretrained`]: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") +``` + +Dann übergeben Sie Ihren Satz an den Tokenizer: + +```py +>>> encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.") +>>> print(encoded_input) +{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +Der Tokenizer gibt ein Wörterbuch mit drei wichtigen Elementen zurück: + +* [input_ids](glossary#input-ids) sind die Indizes, die den einzelnen Token im Satz entsprechen. +* [attention_mask](glossary#attention-mask) gibt an, ob ein Token beachtet werden soll oder nicht. 
+* [token_type_ids](glossary#token-type-ids) gibt an, zu welcher Sequenz ein Token gehört, wenn es mehr als eine Sequenz gibt. + +Sie können die `input_ids` dekodieren, um die ursprüngliche Eingabe zurückzugeben: + +```py +>>> tokenizer.decode(encoded_input["input_ids"]) +'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]' +``` + +Wie Sie sehen können, hat der Tokenisierer zwei spezielle Token - `CLS` und `SEP` (Klassifikator und Separator) - zum Satz hinzugefügt. Nicht alle Modelle benötigen +spezielle Token, aber wenn dies der Fall ist, fügt der Tokenisierer sie automatisch für Sie hinzu. + +Wenn Sie mehrere Sätze verarbeiten wollen, übergeben Sie die Sätze als Liste an den Tokenizer: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_inputs = tokenizer(batch_sentences) +>>> print(encoded_inputs) +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1]]} +``` + +### Pad + +Dies bringt uns zu einem wichtigen Thema. Wenn Sie einen Haufen von Sätzen verarbeiten, sind diese nicht immer gleich lang. Das ist ein Problem, weil Tensoren, die Eingabe für das Modell, eine einheitliche Form haben müssen. Padding ist eine Strategie, die sicherstellt, dass Tensoren rechteckig sind, indem ein spezielles *Padding-Token* zu Sätzen mit weniger Token hinzugefügt wird. + +Setzen Sie den Parameter "padding" auf "true", um die kürzeren Sequenzen im Stapel so aufzufüllen, dass sie der längsten Sequenz entsprechen: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_input = tokenizer(batch_sentences, padding=True) +>>> print(encoded_input) +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} +``` + +Beachten Sie, dass der Tokenizer den ersten und den dritten Satz mit einer "0" aufgefüllt hat, weil sie kürzer sind! + +### Kürzung + +Auf der anderen Seite des Spektrums kann es vorkommen, dass eine Sequenz zu lang für ein Modell ist. In diesem Fall müssen Sie die Sequenz auf eine kürzere Länge kürzen. + +Setzen Sie den Parameter "truncation" auf "true", um eine Sequenz auf die vom Modell akzeptierte Höchstlänge zu kürzen: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... 
] +>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True) +>>> print(encoded_input) +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} +``` + +### Tensoren erstellen + +Schließlich möchten Sie, dass der Tokenizer die tatsächlichen Tensoren zurückgibt, die dem Modell zugeführt werden. + +Setzen Sie den Parameter `return_tensors` entweder auf `pt` für PyTorch, oder `tf` für TensorFlow: + + + + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt") +>>> print(encoded_input) +{'input_ids': tensor([[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]]), + 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), + 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])} +``` + + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf") +>>> print(encoded_input) +{'input_ids': , + 'token_type_ids': , + 'attention_mask': } +``` + + + +## Audio + +Audioeingaben werden anders vorverarbeitet als Texteingaben, aber das Endziel bleibt dasselbe: numerische Sequenzen zu erstellen, die das Modell verstehen kann. Ein [feature extractor](main_classes/feature_extractor) dient dem ausdrücklichen Zweck, Merkmale aus Rohbild- oder Audiodaten zu extrahieren und in Tensoren zu konvertieren. Bevor Sie beginnen, installieren Sie 🤗 Datasets, um einen Audio-Datensatz zu laden, mit dem Sie experimentieren können: + +```bash +pip install datasets +``` + +Laden Sie den [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) Datensatz (weitere Informationen zum Laden eines Datensatzes finden Sie im 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html)): + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") +``` + +Greifen Sie auf das erste Element der `audio`-Spalte zu, um einen Blick auf die Eingabe zu werfen. Durch den Aufruf der Spalte "audio" wird die Audiodatei automatisch geladen und neu gesampelt: + +```py +>>> dataset[0]["audio"] +{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, + 0. , 0. 
], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'sampling_rate': 8000} +``` + +Dies gibt drei Elemente zurück: + +* "array" ist das Sprachsignal, das als 1D-Array geladen - und möglicherweise neu gesampelt - wurde. +* Pfad" zeigt auf den Speicherort der Audiodatei. +* `sampling_rate` bezieht sich darauf, wie viele Datenpunkte im Sprachsignal pro Sekunde gemessen werden. + +### Resample + +Für dieses Tutorial werden Sie das Modell [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) verwenden. Wie Sie aus der Modellkarte ersehen können, ist das Wav2Vec2-Modell auf 16kHz abgetastetes Sprachaudio vortrainiert. Es ist wichtig, dass die Abtastrate Ihrer Audiodaten mit der Abtastrate des Datensatzes übereinstimmt, der für das Pre-Training des Modells verwendet wurde. Wenn die Abtastrate Ihrer Daten nicht dieselbe ist, müssen Sie Ihre Audiodaten neu abtasten. + +Der Datensatz [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) hat zum Beispiel eine Abtastrate von 8000 kHz. Um das Wav2Vec2-Modell mit diesem Datensatz verwenden zu können, müssen Sie die Abtastrate auf 16 kHz erhöhen: + +```py +>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") +>>> dataset[0]["audio"] +{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, + 0. , 0. ], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'sampling_rate': 8000} +``` + +1. Verwenden Sie die Methode [~datasets.Dataset.cast_column] von 🤗 Datasets, um die Abtastrate auf 16kHz zu erhöhen: + +```py +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000)) +``` + +2. Laden Sie die Audiodatei: + +```py +>>> dataset[0]["audio"] +{'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ..., + 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'sampling_rate': 16000} +``` + +Wie Sie sehen können, ist die Abtastrate jetzt 16kHz! + +### Merkmalsextraktor + +Der nächste Schritt ist das Laden eines Merkmalsextraktors, um die Eingabe zu normalisieren und aufzufüllen. Beim Auffüllen von Textdaten wird für kürzere Sequenzen ein `0` hinzugefügt. Die gleiche Idee gilt für Audiodaten, und der Audio-Feature-Extraktor fügt eine `0` - interpretiert als Stille - zu `array` hinzu. + +Laden Sie den Merkmalsextraktor mit [`AutoFeatureExtractor.from_pretrained`]: + +```py +>>> from transformers import AutoFeatureExtractor + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") +``` + +Übergeben Sie das Audio-"Array" an den Feature-Extraktor. Wir empfehlen auch, das Argument `sampling_rate` im Feature Extractor hinzuzufügen, um eventuell auftretende stille Fehler besser zu beheben. + +```py +>>> audio_input = [dataset[0]["audio"]["array"]] +>>> feature_extractor(audio_input, sampling_rate=16000) +{'input_values': [array([ 3.8106556e-04, 2.7506407e-03, 2.8015103e-03, ..., + 5.6335266e-04, 4.6588284e-06, -1.7142107e-04], dtype=float32)]} +``` + +### Auffüllen und Kürzen + +Genau wie beim Tokenizer können Sie variable Sequenzen in einem Stapel durch Auffüllen oder Abschneiden behandeln. 
Werfen Sie einen Blick auf die Sequenzlänge dieser beiden Audiobeispiele: + +```py +>>> dataset[0]["audio"]["array"].shape +(173398,) + +>>> dataset[1]["audio"]["array"].shape +(106496,) +``` + +Wie Sie sehen können, hat das erste Beispiel eine längere Sequenz als das zweite Beispiel. Lassen Sie uns eine Funktion erstellen, die den Datensatz vorverarbeitet. Geben Sie eine maximale Länge der Probe an, und der Feature-Extraktor wird die Sequenzen entweder auffüllen oder abschneiden, damit sie dieser Länge entsprechen: + +```py +>>> def preprocess_function(examples): +... audio_arrays = [x["array"] for x in examples["audio"]] +... inputs = feature_extractor( +... audio_arrays, +... sampling_rate=16000, +... padding=True, +... max_length=100000, +... truncation=True, +... ) +... return inputs +``` + +Wenden Sie die Funktion auf die ersten paar Beispiele im Datensatz an: + +```py +>>> processed_dataset = preprocess_function(dataset[:5]) +``` + +Schauen Sie sich nun noch einmal die verarbeiteten Beispiel-Längen an: + +```py +>>> processed_dataset["input_values"][0].shape +(100000,) + +>>> processed_dataset["input_values"][1].shape +(100000,) +``` + +Die Länge der ersten beiden Beispiele entspricht nun der von Ihnen angegebenen Maximallänge. + +## Bildverarbeitung + +Ein Merkmalsextraktor wird auch verwendet, um Bilder für Bildverarbeitungsaufgaben zu verarbeiten. Auch hier besteht das Ziel darin, das Rohbild in eine Reihe von Tensoren als Eingabe zu konvertieren. + +Laden wir den [food101](https://huggingface.co/datasets/food101) Datensatz für dieses Tutorial. Verwenden Sie den Parameter 🤗 Datasets `split`, um nur eine kleine Stichprobe aus dem Trainingssplit zu laden, da der Datensatz recht groß ist: + +```py +>>> from datasets import load_dataset + +>>> dataset = load_dataset("food101", split="train[:100]") +``` + +Als Nächstes sehen Sie sich das Bild mit dem Merkmal 🤗 Datensätze [Bild] (https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image) an: + +```py +>>> dataset[0]["image"] +``` + +![vision-preprocess-tutorial.png](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vision-preprocess-tutorial.png) + +### Merkmalsextraktor + +Laden Sie den Merkmalsextraktor mit [`AutoImageProcessor.from_pretrained`]: + +```py +>>> from transformers import AutoImageProcessor + +>>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") +``` + +### Datenerweiterung + +Bei Bildverarbeitungsaufgaben ist es üblich, den Bildern als Teil der Vorverarbeitung eine Art von Datenerweiterung hinzuzufügen. Sie können Erweiterungen mit jeder beliebigen Bibliothek hinzufügen, aber in diesem Tutorial werden Sie das Modul [`transforms`](https://pytorch.org/vision/stable/transforms.html) von torchvision verwenden. + +1. Normalisieren Sie das Bild und verwenden Sie [`Compose`](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html), um einige Transformationen - [`RandomResizedCrop`](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html) und [`ColorJitter`](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html) - miteinander zu verknüpfen: + +```py +>>> from torchvision.transforms import Compose, Normalize, RandomResizedCrop, ColorJitter, ToTensor + +>>> normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std) +>>> _transforms = Compose( +... 
[RandomResizedCrop(image_processor.size["height"]), ColorJitter(brightness=0.5, hue=0.5), ToTensor(), normalize] +... ) +``` + +2. Das Modell akzeptiert [`pixel_values`](model_doc/visionencoderdecoder#transformers.VisionEncoderDecoderModel.forward.pixel_values) als Eingabe. Dieser Wert wird vom Merkmalsextraktor erzeugt. Erstellen Sie eine Funktion, die `pixel_values` aus den Transformationen erzeugt: + +```py +>>> def transforms(examples): +... examples["pixel_values"] = [_transforms(image.convert("RGB")) for image in examples["image"]] +... return examples +``` + +3. Dann verwenden Sie 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform), um die Transformationen im laufenden Betrieb anzuwenden: + +```py +>>> dataset.set_transform(transforms) +``` + +4. Wenn Sie nun auf das Bild zugreifen, werden Sie feststellen, dass der Feature Extractor die Modelleingabe "pixel_values" hinzugefügt hat: + +```py +>>> dataset[0]["image"] +{'image': , + 'label': 6, + 'pixel_values': tensor([[[ 0.0353, 0.0745, 0.1216, ..., -0.9922, -0.9922, -0.9922], + [-0.0196, 0.0667, 0.1294, ..., -0.9765, -0.9843, -0.9922], + [ 0.0196, 0.0824, 0.1137, ..., -0.9765, -0.9686, -0.8667], + ..., + [ 0.0275, 0.0745, 0.0510, ..., -0.1137, -0.1216, -0.0824], + [ 0.0667, 0.0824, 0.0667, ..., -0.0588, -0.0745, -0.0980], + [ 0.0353, 0.0353, 0.0431, ..., -0.0039, -0.0039, -0.0588]], + + [[ 0.2078, 0.2471, 0.2863, ..., -0.9451, -0.9373, -0.9451], + [ 0.1608, 0.2471, 0.3098, ..., -0.9373, -0.9451, -0.9373], + [ 0.2078, 0.2706, 0.3020, ..., -0.9608, -0.9373, -0.8275], + ..., + [-0.0353, 0.0118, -0.0039, ..., -0.2392, -0.2471, -0.2078], + [ 0.0196, 0.0353, 0.0196, ..., -0.1843, -0.2000, -0.2235], + [-0.0118, -0.0039, -0.0039, ..., -0.0980, -0.0980, -0.1529]], + + [[ 0.3961, 0.4431, 0.4980, ..., -0.9216, -0.9137, -0.9216], + [ 0.3569, 0.4510, 0.5216, ..., -0.9059, -0.9137, -0.9137], + [ 0.4118, 0.4745, 0.5216, ..., -0.9137, -0.8902, -0.7804], + ..., + [-0.2314, -0.1922, -0.2078, ..., -0.4196, -0.4275, -0.3882], + [-0.1843, -0.1686, -0.2000, ..., -0.3647, -0.3804, -0.4039], + [-0.1922, -0.1922, -0.1922, ..., -0.2941, -0.2863, -0.3412]]])} +``` + +Hier sehen Sie, wie das Bild nach der Vorverarbeitung aussieht. Wie von den angewandten Transformationen zu erwarten, wurde das Bild willkürlich beschnitten und seine Farbeigenschaften sind anders. + +```py +>>> import numpy as np +>>> import matplotlib.pyplot as plt + +>>> img = dataset[0]["pixel_values"] +>>> plt.imshow(img.permute(1, 2, 0)) +``` + +![preprocessed_image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/preprocessed_image.png) + +## Multimodal + +Für multimodale Aufgaben werden Sie eine Kombination aus allem, was Sie bisher gelernt haben, verwenden und Ihre Fähigkeiten auf eine Aufgabe der automatischen Spracherkennung (ASR) anwenden. Dies bedeutet, dass Sie einen: + +* Feature Extractor zur Vorverarbeitung der Audiodaten. +* Tokenizer, um den Text zu verarbeiten. 
+ +Kehren wir zum [LJ Speech](https://huggingface.co/datasets/lj_speech) Datensatz zurück: + +```py +>>> from datasets import load_dataset + +>>> lj_speech = load_dataset("lj_speech", split="train") +``` + +Da Sie hauptsächlich an den Spalten "Audio" und "Text" interessiert sind, entfernen Sie die anderen Spalten: + +```py +>>> lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"]) +``` + +Schauen Sie sich nun die Spalten "Audio" und "Text" an: + +```py +>>> lj_speech[0]["audio"] +{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ..., + 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', + 'sampling_rate': 22050} + +>>> lj_speech[0]["text"] +'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition' +``` + +Erinnern Sie sich an den früheren Abschnitt über die Verarbeitung von Audiodaten: Sie sollten immer die Abtastrate Ihrer Audiodaten [resample](preprocessing#audio), damit sie mit der Abtastrate des Datensatzes übereinstimmt, der für das Vortraining eines Modells verwendet wird: + +```py +>>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000)) +``` + +### Prozessor + +Ein Processor kombiniert einen Feature-Extraktor und einen Tokenizer. Laden Sie einen Processor mit [`AutoProcessor.from_pretrained]: + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") +``` + +1. Erstellen Sie eine Funktion, die die Audiodaten zu `input_values` verarbeitet und den Text zu `labels` tokenisiert. Dies sind Ihre Eingaben für das Modell: + +```py +>>> def prepare_dataset(example): +... audio = example["audio"] + +... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000)) + +... return example +``` + +2. Wenden Sie die Funktion "prepare_dataset" auf ein Beispiel an: + +```py +>>> prepare_dataset(lj_speech[0]) +``` + +Beachten Sie, dass der Processor `input_values` und `labels` hinzugefügt hat. Auch die Abtastrate wurde korrekt auf 16kHz heruntergerechnet. + +Toll, Sie sollten jetzt in der Lage sein, Daten für jede Modalität vorzuverarbeiten und sogar verschiedene Modalitäten zu kombinieren! Im nächsten Kurs lernen Sie, wie Sie ein Modell mit Ihren neu aufbereiteten Daten feinabstimmen können. diff --git a/docs/source/de/preprocessing.mdx b/docs/source/de/preprocessing.mdx deleted file mode 100644 index ea6c185cc101..000000000000 --- a/docs/source/de/preprocessing.mdx +++ /dev/null @@ -1,502 +0,0 @@ - - -# Vorverarbeiten - -[[open-in-colab]] - -Bevor Sie Ihre Daten in einem Modell verwenden können, müssen die Daten in ein für das Modell akzeptables Format gebracht werden. Ein Modell versteht keine Rohtexte, Bilder oder Audiodaten. Diese Eingaben müssen in Zahlen umgewandelt und zu Tensoren zusammengesetzt werden. In dieser Anleitung werden Sie: - -* Textdaten mit einem Tokenizer vorverarbeiten. -* Bild- oder Audiodaten mit einem Feature Extractor vorverarbeiten. -* Daten für eine multimodale Aufgabe mit einem Prozessor vorverarbeiten. - -## NLP - - - -Das wichtigste Werkzeug zur Verarbeitung von Textdaten ist ein [Tokenizer](main_classes/tokenizer). Ein Tokenizer zerlegt Text zunächst nach einer Reihe von Regeln in *Token*. 
Die Token werden in Zahlen umgewandelt, die zum Aufbau von Tensoren als Eingabe für ein Modell verwendet werden. Alle zusätzlichen Eingaben, die ein Modell benötigt, werden ebenfalls vom Tokenizer hinzugefügt. - - - -Wenn Sie ein vortrainiertes Modell verwenden möchten, ist es wichtig, den zugehörigen vortrainierten Tokenizer zu verwenden. Dadurch wird sichergestellt, dass der Text auf die gleiche Weise aufgeteilt wird wie das Pretraining-Korpus und die gleichen entsprechenden Token-zu-Index (in der Regel als *vocab* bezeichnet) während des Pretrainings verwendet werden. - - - -Laden Sie einen vortrainierten Tokenizer mit der Klasse [AutoTokenizer], um schnell loszulegen. Damit wird das *vocab* heruntergeladen, das verwendet wird, wenn ein Modell vortrainiert wird. - -### Tokenize - -Laden Sie einen vortrainierten Tokenizer mit [`AutoTokenizer.from_pretrained`]: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") -``` - -Dann übergeben Sie Ihren Satz an den Tokenizer: - -```py ->>> encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.") ->>> print(encoded_input) -{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -Der Tokenizer gibt ein Wörterbuch mit drei wichtigen Elementen zurück: - -* [input_ids](glossary#input-ids) sind die Indizes, die den einzelnen Token im Satz entsprechen. -* [attention_mask](glossary#attention-mask) gibt an, ob ein Token beachtet werden soll oder nicht. -* [token_type_ids](glossary#token-type-ids) gibt an, zu welcher Sequenz ein Token gehört, wenn es mehr als eine Sequenz gibt. - -Sie können die `input_ids` dekodieren, um die ursprüngliche Eingabe zurückzugeben: - -```py ->>> tokenizer.decode(encoded_input["input_ids"]) -'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]' -``` - -Wie Sie sehen können, hat der Tokenisierer zwei spezielle Token - `CLS` und `SEP` (Klassifikator und Separator) - zum Satz hinzugefügt. Nicht alle Modelle benötigen -spezielle Token, aber wenn dies der Fall ist, fügt der Tokenisierer sie automatisch für Sie hinzu. - -Wenn Sie mehrere Sätze verarbeiten wollen, übergeben Sie die Sätze als Liste an den Tokenizer: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_inputs = tokenizer(batch_sentences) ->>> print(encoded_inputs) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1]]} -``` - -### Pad - -Dies bringt uns zu einem wichtigen Thema. Wenn Sie einen Haufen von Sätzen verarbeiten, sind diese nicht immer gleich lang. Das ist ein Problem, weil Tensoren, die Eingabe für das Modell, eine einheitliche Form haben müssen. 
Padding ist eine Strategie, die sicherstellt, dass Tensoren rechteckig sind, indem ein spezielles *Padding-Token* zu Sätzen mit weniger Token hinzugefügt wird. - -Setzen Sie den Parameter "padding" auf "true", um die kürzeren Sequenzen im Stapel so aufzufüllen, dass sie der längsten Sequenz entsprechen: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch_sentences, padding=True) ->>> print(encoded_input) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} -``` - -Beachten Sie, dass der Tokenizer den ersten und den dritten Satz mit einer "0" aufgefüllt hat, weil sie kürzer sind! - -### Kürzung - -Auf der anderen Seite des Spektrums kann es vorkommen, dass eine Sequenz zu lang für ein Modell ist. In diesem Fall müssen Sie die Sequenz auf eine kürzere Länge kürzen. - -Setzen Sie den Parameter "truncation" auf "true", um eine Sequenz auf die vom Modell akzeptierte Höchstlänge zu kürzen: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True) ->>> print(encoded_input) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} -``` - -### Tensoren erstellen - -Schließlich möchten Sie, dass der Tokenizer die tatsächlichen Tensoren zurückgibt, die dem Modell zugeführt werden. - -Setzen Sie den Parameter `return_tensors` entweder auf `pt` für PyTorch, oder `tf` für TensorFlow: - - - - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... 
] ->>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt") ->>> print(encoded_input) -{'input_ids': tensor([[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]]), - 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), - 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])} -``` - - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf") ->>> print(encoded_input) -{'input_ids': , - 'token_type_ids': , - 'attention_mask': } -``` - - - -## Audio - -Audioeingaben werden anders vorverarbeitet als Texteingaben, aber das Endziel bleibt dasselbe: numerische Sequenzen zu erstellen, die das Modell verstehen kann. Ein [feature extractor](main_classes/feature_extractor) dient dem ausdrücklichen Zweck, Merkmale aus Rohbild- oder Audiodaten zu extrahieren und in Tensoren zu konvertieren. Bevor Sie beginnen, installieren Sie 🤗 Datasets, um einen Audio-Datensatz zu laden, mit dem Sie experimentieren können: - -```bash -pip install datasets -``` - -Laden Sie den [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) Datensatz (weitere Informationen zum Laden eines Datensatzes finden Sie im 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html)): - -```py ->>> from datasets import load_dataset, Audio - ->>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") -``` - -Greifen Sie auf das erste Element der `audio`-Spalte zu, um einen Blick auf die Eingabe zu werfen. Durch den Aufruf der Spalte "audio" wird die Audiodatei automatisch geladen und neu gesampelt: - -```py ->>> dataset[0]["audio"] -{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, - 0. , 0. ], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 8000} -``` - -Dies gibt drei Elemente zurück: - -* "array" ist das Sprachsignal, das als 1D-Array geladen - und möglicherweise neu gesampelt - wurde. -* Pfad" zeigt auf den Speicherort der Audiodatei. -* `sampling_rate` bezieht sich darauf, wie viele Datenpunkte im Sprachsignal pro Sekunde gemessen werden. - -### Resample - -Für dieses Tutorial werden Sie das Modell [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) verwenden. Wie Sie aus der Modellkarte ersehen können, ist das Wav2Vec2-Modell auf 16kHz abgetastetes Sprachaudio vortrainiert. Es ist wichtig, dass die Abtastrate Ihrer Audiodaten mit der Abtastrate des Datensatzes übereinstimmt, der für das Pre-Training des Modells verwendet wurde. Wenn die Abtastrate Ihrer Daten nicht dieselbe ist, müssen Sie Ihre Audiodaten neu abtasten. - -Der Datensatz [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) hat zum Beispiel eine Abtastrate von 8000 kHz. 
Um das Wav2Vec2-Modell mit diesem Datensatz verwenden zu können, müssen Sie die Abtastrate auf 16 kHz erhöhen: - -```py ->>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") ->>> dataset[0]["audio"] -{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, - 0. , 0. ], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 8000} -``` - -1. Verwenden Sie die Methode [~datasets.Dataset.cast_column] von 🤗 Datasets, um die Abtastrate auf 16kHz zu erhöhen: - -```py ->>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000)) -``` - -2. Laden Sie die Audiodatei: - -```py ->>> dataset[0]["audio"] -{'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ..., - 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 16000} -``` - -Wie Sie sehen können, ist die Abtastrate jetzt 16kHz! - -### Merkmalsextraktor - -Der nächste Schritt ist das Laden eines Merkmalsextraktors, um die Eingabe zu normalisieren und aufzufüllen. Beim Auffüllen von Textdaten wird für kürzere Sequenzen ein `0` hinzugefügt. Die gleiche Idee gilt für Audiodaten, und der Audio-Feature-Extraktor fügt eine `0` - interpretiert als Stille - zu `array` hinzu. - -Laden Sie den Merkmalsextraktor mit [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") -``` - -Übergeben Sie das Audio-"Array" an den Feature-Extraktor. Wir empfehlen auch, das Argument `sampling_rate` im Feature Extractor hinzuzufügen, um eventuell auftretende stille Fehler besser zu beheben. - -```py ->>> audio_input = [dataset[0]["audio"]["array"]] ->>> feature_extractor(audio_input, sampling_rate=16000) -{'input_values': [array([ 3.8106556e-04, 2.7506407e-03, 2.8015103e-03, ..., - 5.6335266e-04, 4.6588284e-06, -1.7142107e-04], dtype=float32)]} -``` - -### Auffüllen und Kürzen - -Genau wie beim Tokenizer können Sie variable Sequenzen in einem Stapel durch Auffüllen oder Abschneiden behandeln. Werfen Sie einen Blick auf die Sequenzlänge dieser beiden Audiobeispiele: - -```py ->>> dataset[0]["audio"]["array"].shape -(173398,) - ->>> dataset[1]["audio"]["array"].shape -(106496,) -``` - -Wie Sie sehen können, hat das erste Beispiel eine längere Sequenz als das zweite Beispiel. Lassen Sie uns eine Funktion erstellen, die den Datensatz vorverarbeitet. Geben Sie eine maximale Länge der Probe an, und der Feature-Extraktor wird die Sequenzen entweder auffüllen oder abschneiden, damit sie dieser Länge entsprechen: - -```py ->>> def preprocess_function(examples): -... audio_arrays = [x["array"] for x in examples["audio"]] -... inputs = feature_extractor( -... audio_arrays, -... sampling_rate=16000, -... padding=True, -... max_length=100000, -... truncation=True, -... ) -... 
return inputs -``` - -Wenden Sie die Funktion auf die ersten paar Beispiele im Datensatz an: - -```py ->>> processed_dataset = preprocess_function(dataset[:5]) -``` - -Schauen Sie sich nun noch einmal die verarbeiteten Beispiel-Längen an: - -```py ->>> processed_dataset["input_values"][0].shape -(100000,) - ->>> processed_dataset["input_values"][1].shape -(100000,) -``` - -Die Länge der ersten beiden Beispiele entspricht nun der von Ihnen angegebenen Maximallänge. - -## Bildverarbeitung - -Ein Merkmalsextraktor wird auch verwendet, um Bilder für Bildverarbeitungsaufgaben zu verarbeiten. Auch hier besteht das Ziel darin, das Rohbild in eine Reihe von Tensoren als Eingabe zu konvertieren. - -Laden wir den [food101](https://huggingface.co/datasets/food101) Datensatz für dieses Tutorial. Verwenden Sie den Parameter 🤗 Datasets `split`, um nur eine kleine Stichprobe aus dem Trainingssplit zu laden, da der Datensatz recht groß ist: - -```py ->>> from datasets import load_dataset - ->>> dataset = load_dataset("food101", split="train[:100]") -``` - -Als Nächstes sehen Sie sich das Bild mit dem Merkmal 🤗 Datensätze [Bild] (https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image) an: - -```py ->>> dataset[0]["image"] -``` - -![vision-preprocess-tutorial.png](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vision-preprocess-tutorial.png) - -### Merkmalsextraktor - -Laden Sie den Merkmalsextraktor mit [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224") -``` - -### Datenerweiterung - -Bei Bildverarbeitungsaufgaben ist es üblich, den Bildern als Teil der Vorverarbeitung eine Art von Datenerweiterung hinzuzufügen. Sie können Erweiterungen mit jeder beliebigen Bibliothek hinzufügen, aber in diesem Tutorial werden Sie das Modul [`transforms`](https://pytorch.org/vision/stable/transforms.html) von torchvision verwenden. - -1. Normalisieren Sie das Bild und verwenden Sie [`Compose`](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html), um einige Transformationen - [`RandomResizedCrop`](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html) und [`ColorJitter`](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html) - miteinander zu verknüpfen: - -```py ->>> from torchvision.transforms import Compose, Normalize, RandomResizedCrop, ColorJitter, ToTensor - ->>> normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std) ->>> _transforms = Compose( -... [RandomResizedCrop(feature_extractor.size), ColorJitter(brightness=0.5, hue=0.5), ToTensor(), normalize] -... ) -``` - -2. Das Modell akzeptiert [`pixel_values`](model_doc/visionencoderdecoder#transformers.VisionEncoderDecoderModel.forward.pixel_values) als Eingabe. Dieser Wert wird vom Merkmalsextraktor erzeugt. Erstellen Sie eine Funktion, die `pixel_values` aus den Transformationen erzeugt: - -```py ->>> def transforms(examples): -... examples["pixel_values"] = [_transforms(image.convert("RGB")) for image in examples["image"]] -... return examples -``` - -3. Dann verwenden Sie 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform), um die Transformationen im laufenden Betrieb anzuwenden: - -```py ->>> dataset.set_transform(transforms) -``` - -4. 
Wenn Sie nun auf das Bild zugreifen, werden Sie feststellen, dass der Feature Extractor die Modelleingabe "pixel_values" hinzugefügt hat: - -```py ->>> dataset[0]["image"] -{'image': , - 'label': 6, - 'pixel_values': tensor([[[ 0.0353, 0.0745, 0.1216, ..., -0.9922, -0.9922, -0.9922], - [-0.0196, 0.0667, 0.1294, ..., -0.9765, -0.9843, -0.9922], - [ 0.0196, 0.0824, 0.1137, ..., -0.9765, -0.9686, -0.8667], - ..., - [ 0.0275, 0.0745, 0.0510, ..., -0.1137, -0.1216, -0.0824], - [ 0.0667, 0.0824, 0.0667, ..., -0.0588, -0.0745, -0.0980], - [ 0.0353, 0.0353, 0.0431, ..., -0.0039, -0.0039, -0.0588]], - - [[ 0.2078, 0.2471, 0.2863, ..., -0.9451, -0.9373, -0.9451], - [ 0.1608, 0.2471, 0.3098, ..., -0.9373, -0.9451, -0.9373], - [ 0.2078, 0.2706, 0.3020, ..., -0.9608, -0.9373, -0.8275], - ..., - [-0.0353, 0.0118, -0.0039, ..., -0.2392, -0.2471, -0.2078], - [ 0.0196, 0.0353, 0.0196, ..., -0.1843, -0.2000, -0.2235], - [-0.0118, -0.0039, -0.0039, ..., -0.0980, -0.0980, -0.1529]], - - [[ 0.3961, 0.4431, 0.4980, ..., -0.9216, -0.9137, -0.9216], - [ 0.3569, 0.4510, 0.5216, ..., -0.9059, -0.9137, -0.9137], - [ 0.4118, 0.4745, 0.5216, ..., -0.9137, -0.8902, -0.7804], - ..., - [-0.2314, -0.1922, -0.2078, ..., -0.4196, -0.4275, -0.3882], - [-0.1843, -0.1686, -0.2000, ..., -0.3647, -0.3804, -0.4039], - [-0.1922, -0.1922, -0.1922, ..., -0.2941, -0.2863, -0.3412]]])} -``` - -Hier sehen Sie, wie das Bild nach der Vorverarbeitung aussieht. Wie von den angewandten Transformationen zu erwarten, wurde das Bild willkürlich beschnitten und seine Farbeigenschaften sind anders. - -```py ->>> import numpy as np ->>> import matplotlib.pyplot as plt - ->>> img = dataset[0]["pixel_values"] ->>> plt.imshow(img.permute(1, 2, 0)) -``` - -![preprocessed_image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/preprocessed_image.png) - -## Multimodal - -Für multimodale Aufgaben werden Sie eine Kombination aus allem, was Sie bisher gelernt haben, verwenden und Ihre Fähigkeiten auf eine Aufgabe der automatischen Spracherkennung (ASR) anwenden. Dies bedeutet, dass Sie einen: - -* Feature Extractor zur Vorverarbeitung der Audiodaten. -* Tokenizer, um den Text zu verarbeiten. 
- -Kehren wir zum [LJ Speech](https://huggingface.co/datasets/lj_speech) Datensatz zurück: - -```py ->>> from datasets import load_dataset - ->>> lj_speech = load_dataset("lj_speech", split="train") -``` - -Da Sie hauptsächlich an den Spalten "Audio" und "Text" interessiert sind, entfernen Sie die anderen Spalten: - -```py ->>> lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"]) -``` - -Schauen Sie sich nun die Spalten "Audio" und "Text" an: - -```py ->>> lj_speech[0]["audio"] -{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ..., - 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', - 'sampling_rate': 22050} - ->>> lj_speech[0]["text"] -'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition' -``` - -Erinnern Sie sich an den früheren Abschnitt über die Verarbeitung von Audiodaten: Sie sollten immer die Abtastrate Ihrer Audiodaten [resample](preprocessing#audio), damit sie mit der Abtastrate des Datensatzes übereinstimmt, der für das Vortraining eines Modells verwendet wird: - -```py ->>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000)) -``` - -### Prozessor - -Ein Processor kombiniert einen Feature-Extraktor und einen Tokenizer. Laden Sie einen Processor mit [`AutoProcessor.from_pretrained]: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") -``` - -1. Erstellen Sie eine Funktion, die die Audiodaten zu `input_values` verarbeitet und den Text zu `labels` tokenisiert. Dies sind Ihre Eingaben für das Modell: - -```py ->>> def prepare_dataset(example): -... audio = example["audio"] - -... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000)) - -... return example -``` - -2. Wenden Sie die Funktion "prepare_dataset" auf ein Beispiel an: - -```py ->>> prepare_dataset(lj_speech[0]) -``` - -Beachten Sie, dass der Processor `input_values` und `labels` hinzugefügt hat. Auch die Abtastrate wurde korrekt auf 16kHz heruntergerechnet. - -Toll, Sie sollten jetzt in der Lage sein, Daten für jede Modalität vorzuverarbeiten und sogar verschiedene Modalitäten zu kombinieren! Im nächsten Kurs lernen Sie, wie Sie ein Modell mit Ihren neu aufbereiteten Daten feinabstimmen können. diff --git a/docs/source/de/quicktour.md b/docs/source/de/quicktour.md new file mode 100644 index 000000000000..139869e5d1ee --- /dev/null +++ b/docs/source/de/quicktour.md @@ -0,0 +1,438 @@ + + +# Schnellstart + +[[open-in-colab]] + +Mit 🤗 Transformers können Sie sofort loslegen! Verwenden Sie die [`pipeline`] für schnelle Inferenz und laden Sie schnell ein vortrainiertes Modell und einen Tokenizer mit einer [AutoClass](./model_doc/auto), um Ihre Text-, Bild- oder Audioaufgabe zu lösen. + + + +Alle in der Dokumentation vorgestellten Codebeispiele haben oben links einen Umschalter für PyTorch und TensorFlow. Wenn +nicht, wird erwartet, dass der Code für beide Backends ohne Änderungen funktioniert. + + + +## Pipeline + +[`pipeline`] ist der einfachste Weg, ein vortrainiertes Modell für eine bestimmte Aufgabe zu verwenden. + + + +Die [`pipeline`] unterstützt viele gängige Aufgaben: + +**Text**: +* Stimmungsanalyse: Klassifizierung der Polarität eines gegebenen Textes. 
+* Textgenerierung (auf Englisch): Generierung von Text aus einer gegebenen Eingabe. +* Name-Entity-Recognition (NER): Kennzeichnung jedes Worts mit der Entität, die es repräsentiert (Person, Datum, Ort usw.). +* Beantwortung von Fragen: Extrahieren der Antwort aus dem Kontext, wenn ein gewisser Kontext und eine Frage gegeben sind. +* Fill-mask: Ausfüllen von Lücken in einem Text mit maskierten Wörtern. +* Zusammenfassung: Erstellung einer Zusammenfassung einer langen Text- oder Dokumentensequenz. +* Übersetzung: Übersetzen eines Textes in eine andere Sprache. +* Merkmalsextraktion: Erstellen einer Tensordarstellung des Textes. + +**Bild**: +* Bildklassifizierung: Klassifizierung eines Bildes. +* Bildsegmentierung: Klassifizierung jedes Pixels in einem Bild. +* Objekterkennung: Erkennen von Objekten innerhalb eines Bildes. + +**Audio**: +* Audioklassifizierung: Zuweisung eines Labels zu einem bestimmten Audiosegment. +* Automatische Spracherkennung (ASR): Transkription von Audiodaten in Text. + + + +Für mehr Details über die [`pipeline`] und assoziierte Aufgaben, schauen Sie in die Dokumentation [hier](./main_classes/pipelines). + + + +### Verwendung der Pipeline + +Im folgenden Beispiel werden Sie die [`pipeline`] für die Stimmungsanalyse verwenden. + +Installieren Sie die folgenden Abhängigkeiten, falls Sie dies nicht bereits getan haben: + + + + +```bash +pip install torch +``` + + + +```bash +pip install tensorflow +``` + + + +Importieren sie die [`pipeline`] und spezifizieren sie die Aufgabe, welche sie lösen möchten: + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline("sentiment-analysis") +``` + +Die Pipeline lädt ein standardmäßiges [vortrainiertes Modell] (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) und einen Tokenizer für die Stimmungs-Analyse herunter und speichert sie. Jetzt können Sie den "Klassifikator" auf Ihren Zieltext anwenden: + +```py +>>> classifier("We are very happy to show you the 🤗 Transformers library.") +[{'label': 'POSITIVE', 'score': 0.9998}] +``` + +For more than one sentence, pass a list of sentences to the [`pipeline`] which returns a list of dictionaries: + +```py +>>> results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."]) +>>> for result in results: +... print(f"label: {result['label']}, with score: {round(result['score'], 4)}") +label: POSITIVE, with score: 0.9998 +label: NEGATIVE, with score: 0.5309 +``` + +Die [`pipeline`] kann auch über einen ganzen Datensatz iterieren. Starten wir mit der Installation der [🤗 Datasets](https://huggingface.co/docs/datasets/) Bibliothek: + +```bash +pip install datasets +``` + +Erstellen wir eine [`pipeline`] mit der Aufgabe die wir lösen und dem Modell welches wir nutzen möchten. + +```py +>>> import torch +>>> from transformers import pipeline + +>>> speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h") +``` + +Als nächstes laden wir den Datensatz (siehe 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart.html) für mehr Details) welches wir nutzen möchten. 
Zum Beispiel laden wir den [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) Datensatz: + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") # doctest: +IGNORE_RESULT +``` + +Wir müssen sicherstellen, dass die Abtastrate des Datensatzes der Abtastrate entspricht, mit der `facebook/wav2vec2-base-960h` trainiert wurde. + +```py +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate)) +``` + +Audiodateien werden automatisch geladen und neu abgetastet, wenn die Spalte "audio" aufgerufen wird. +Extrahieren wir die rohen Wellenform-Arrays der ersten 4 Beispiele und übergeben wir sie als Liste an die Pipeline: + +```py +>>> result = speech_recognizer(dataset[:4]["audio"]) +>>> print([d["text"] for d in result]) +['I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT', "FODING HOW I'D SET UP A JOIN TO HET WITH MY WIFE AND WHERE THE AP MIGHT BE", "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE AP SO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AND I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS", 'HOW DO I THURN A JOIN A COUNT'] +``` + +Bei einem größeren Datensatz mit vielen Eingaben (wie bei Sprache oder Bildverarbeitung) sollten Sie einen Generator anstelle einer Liste übergeben, damit nicht alle Eingaben auf einmal in den Speicher geladen werden. Weitere Informationen finden Sie in der [Pipeline-Dokumentation](./main_classes/pipelines). + +### Ein anderes Modell und einen anderen Tokenizer in der Pipeline verwenden + +Die [`pipeline`] kann jedes Modell aus dem [Model Hub](https://huggingface.co/models) verwenden, wodurch es einfach ist, die [`pipeline`] für andere Anwendungsfälle anzupassen. Wenn Sie beispielsweise ein Modell wünschen, das französischen Text verarbeiten kann, verwenden Sie die Tags im Model Hub, um nach einem geeigneten Modell zu filtern. Das oberste gefilterte Ergebnis liefert ein mehrsprachiges [BERT-Modell](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment), das auf die Stimmungsanalyse abgestimmt ist. Großartig, verwenden wir dieses Modell!
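Die Suche nach einem passenden Modell lässt sich auch programmatisch erledigen, zum Beispiel mit der Bibliothek `huggingface_hub`. Das Folgende ist nur eine Skizze; die genauen Parameternamen von `list_models` können je nach Version der Bibliothek abweichen:

```py
>>> # Skizze: Modelle auf dem Hub nach Aufgabe filtern und die ersten Treffer ausgeben
>>> # (Annahme: huggingface_hub ist installiert; Parameternamen können je nach Version variieren)
>>> from huggingface_hub import list_models

>>> for model in list_models(filter="text-classification", search="sentiment", limit=5):
...     print(model.modelId)
```

Bleiben wir hier aber bei dem oben gefundenen Checkpoint: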
+ +```py +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +``` + + + +Use the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `AutoClass` below): + +```py +>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained(model_name) +>>> tokenizer = AutoTokenizer.from_pretrained(model_name) +``` + + +Use the [`TFAutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `TFAutoClass` below): + +```py +>>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name) +>>> tokenizer = AutoTokenizer.from_pretrained(model_name) +``` + + + +Dann können Sie das Modell und den Tokenizer in der [`pipeline`] angeben und den `Klassifikator` auf Ihren Zieltext anwenden: + +```py +>>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer) +>>> classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.") +[{'label': '5 stars', 'score': 0.7273}] +``` + +Wenn Sie kein Modell für Ihren Anwendungsfall finden können, müssen Sie ein vortrainiertes Modell auf Ihren Daten feinabstimmen. Schauen Sie sich unser [Feinabstimmungs-Tutorial](./training) an, um zu erfahren, wie das geht. Und schließlich, nachdem Sie Ihr trainiertes Modell verfeinert haben, sollten Sie es mit der Community im Model Hub teilen (siehe Tutorial [hier](./model_sharing)), um NLP für alle zu demokratisieren! 🤗 + +## AutoClass + + + +Unter der Haube arbeiten die Klassen [`AutoModelForSequenceClassification`] und [`AutoTokenizer`] zusammen, um die [`pipeline`] zu betreiben. Eine [`AutoClass`](./model_doc/auto) ist eine Abkürzung, die automatisch die Architektur eines trainierten Modells aus dessen Namen oder Pfad abruft. Sie müssen nur die passende `AutoClass` für Ihre Aufgabe und den zugehörigen Tokenizer mit [`AutoTokenizer`] auswählen. + +Kehren wir zu unserem Beispiel zurück und sehen wir uns an, wie Sie die `AutoClass` verwenden können, um die Ergebnisse der [`pipeline`] zu replizieren. + +### AutoTokenizer + +Ein Tokenizer ist für die Vorverarbeitung von Text in ein für das Modell verständliches Format zuständig. Zunächst zerlegt der Tokenisierer den Text in Wörter, die *Token* genannt werden. Es gibt mehrere Regeln für den Tokenisierungsprozess, z. B. wie und auf welcher Ebene ein Wort aufgespalten wird (weitere Informationen über Tokenisierung [hier](./tokenizer_summary)). Das Wichtigste ist jedoch, dass Sie den Tokenizer mit demselben Modellnamen instanziieren müssen, um sicherzustellen, dass Sie dieselben Tokenisierungsregeln verwenden, mit denen ein Modell zuvor trainiert wurde. +Laden sie einen Tokenizer mit [`AutoTokenizer`]: + +```py +>>> from transformers import AutoTokenizer + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> tokenizer = AutoTokenizer.from_pretrained(model_name) +``` + +Anschließend wandelt der Tokenizer die Token in Zahlen um, um einen Tensor als Eingabe für das Modell zu konstruieren. Dieser wird als *Vokabular* des Modells bezeichnet. 
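Diese beiden Schritte lassen sich auch einzeln nachvollziehen: `tokenize` zerlegt den Text in Token, und `convert_tokens_to_ids` schlägt jedes Token im Vokabular nach. Eine kleine Skizze (welche Token und IDs dabei herauskommen, hängt vom Vokabular des jeweiligen Modells ab):

```py
>>> # Zuerst in Token zerlegen, dann jedes Token über das Vokabular auf seine ID abbilden
>>> tokens = tokenizer.tokenize("We are very happy to show you the 🤗 Transformers library.")
>>> input_ids = tokenizer.convert_tokens_to_ids(tokens)
>>> len(tokens) == len(input_ids)
True
```

Der direkte Aufruf des Tokenizers erledigt beides in einem Schritt und ergänzt zusätzlich die Sondertoken des Modells.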
+ +Übergeben Sie Ihren Text an den Tokenizer: + +```py +>>> encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.") +>>> print(encoding) +{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 100, 58263, 13299, 119, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +Der Tokenizer gibt ein Wörterbuch zurück, das Folgendes enthält: + +* [input_ids](./glossary#input-ids): numerische Repräsentationen Ihrer Token. +* [atttention_mask](.glossary#attention-mask): gibt an, welche Token beachtet werden sollen. + +Genau wie die [`pipeline`] akzeptiert der Tokenizer eine Liste von Eingaben. Darüber hinaus kann der Tokenizer den Text auch auffüllen und kürzen, um einen Stapel mit einheitlicher Länge zurückzugeben: + + + + +```py +>>> pt_batch = tokenizer( +... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], +... padding=True, +... truncation=True, +... max_length=512, +... return_tensors="pt", +... ) +``` + + + +```py +>>> tf_batch = tokenizer( +... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], +... padding=True, +... truncation=True, +... max_length=512, +... return_tensors="tf", +... ) +``` + + + +Lesen Sie das Tutorial [preprocessing](./preprocessing) für weitere Details zur Tokenisierung. + +### AutoModel + + + +🤗 Transformers bietet eine einfache und einheitliche Möglichkeit, vortrainierte Instanzen zu laden. Das bedeutet, dass Sie ein [`AutoModel`] laden können, wie Sie einen [`AutoTokenizer`] laden würden. Der einzige Unterschied ist die Auswahl des richtigen [`AutoModel`] für die Aufgabe. Da Sie eine Text- oder Sequenzklassifizierung vornehmen, laden Sie [`AutoModelForSequenceClassification`]: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name) +``` + + + +In der [Aufgabenzusammenfassung](./task_summary) steht, welche [AutoModel]-Klasse für welche Aufgabe zu verwenden ist. + + + +Jetzt können Sie Ihren vorverarbeiteten Stapel von Eingaben direkt an das Modell übergeben. Sie müssen nur das Wörterbuch entpacken, indem Sie `**` hinzufügen: + +```py +>>> pt_outputs = pt_model(**pt_batch) +``` + +Das Modell gibt die endgültigen Aktivierungen in dem Attribut "logits" aus. Wenden Sie die Softmax-Funktion auf die "logits" an, um die Wahrscheinlichkeiten zu erhalten: + +```py +>>> from torch import nn + +>>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1) +>>> print(pt_predictions) +tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], + [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=) +``` + + +🤗 Transformers bietet eine einfache und einheitliche Methode zum Laden von vortrainierten Instanzen. Das bedeutet, dass Sie ein [`TFAutoModel`] genauso laden können, wie Sie einen [`AutoTokenizer`] laden würden. Der einzige Unterschied ist die Auswahl des richtigen [`TFAutoModel`] für die Aufgabe. 
Da Sie Text - oder Sequenz - Klassifizierung machen, laden Sie [`TFAutoModelForSequenceClassification`]: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name) +``` + + + +In der [Aufgabenzusammenfassung](./task_summary) steht, welche [AutoModel]-Klasse für welche Aufgabe zu verwenden ist. + + + +Jetzt können Sie Ihren vorverarbeiteten Stapel von Eingaben direkt an das Modell übergeben, indem Sie die Wörterbuchschlüssel direkt an die Tensoren übergeben: + +```py +>>> tf_outputs = tf_model(tf_batch) +``` + +Das Modell gibt die endgültigen Aktivierungen in dem Attribut "logits" aus. Wenden Sie die Softmax-Funktion auf die "logits" an, um die Wahrscheinlichkeiten zu erhalten: + +```py +>>> import tensorflow as tf + +>>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1) +>>> tf_predictions # doctest: +IGNORE_RESULT +``` + + + + + +Alle 🤗 Transformers-Modelle (PyTorch oder TensorFlow) geben die Tensoren *vor* der endgültigen Aktivierungsfunktion +Funktion (wie Softmax) aus, da die endgültige Aktivierungsfunktion oft mit dem Verlusten verschmolzen ist. + + + +Modelle sind ein standardmäßiges [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) oder ein [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model), sodass Sie sie in Ihrer üblichen Trainingsschleife verwenden können. Um jedoch die Dinge einfacher zu machen, bietet 🤗 Transformers eine [`Trainer`]-Klasse für PyTorch, die Funktionalität für verteiltes Training, gemischte Präzision und mehr bietet. Für TensorFlow können Sie die Methode `fit` aus [Keras](https://keras.io/) verwenden. Siehe das [training tutorial](./training) für weitere Details. + + + +Transformers-Modellausgaben sind spezielle Datenklassen, so dass ihre Attribute in einer IDE automatisch vervollständigt werden. +Die Modellausgänge verhalten sich auch wie ein Tupel oder ein Wörterbuch (z.B. können Sie mit einem Integer, einem Slice oder einem String indexieren), wobei die Attribute, die "None" sind, ignoriert werden. + + + +### Modell speichern + + + +Sobald Ihr Modell feinabgestimmt ist, können Sie es mit seinem Tokenizer speichern, indem Sie [`PreTrainedModel.save_pretrained`] verwenden: + +```py +>>> pt_save_directory = "./pt_save_pretrained" +>>> tokenizer.save_pretrained(pt_save_directory) # doctest: +IGNORE_RESULT +>>> pt_model.save_pretrained(pt_save_directory) +``` + +Wenn Sie bereit sind, das Modell erneut zu verwenden, laden Sie es mit [`PreTrainedModel.from_pretrained`]: + +```py +>>> pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained") +``` + + +Sobald Ihr Modell feinabgestimmt ist, können Sie es mit seinem Tokenizer unter Verwendung von [`TFPreTrainedModel.save_pretrained`] speichern: + +```py +>>> tf_save_directory = "./tf_save_pretrained" +>>> tokenizer.save_pretrained(tf_save_directory) # doctest: +IGNORE_RESULT +>>> tf_model.save_pretrained(tf_save_directory) +``` + +Wenn Sie bereit sind, das Modell wieder zu verwenden, laden Sie es mit [`TFPreTrainedModel.from_pretrained`]: + +```py +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained") +``` + + + +Ein besonders cooles 🤗 Transformers-Feature ist die Möglichkeit, ein Modell zu speichern und es entweder als PyTorch- oder TensorFlow-Modell wieder zu laden. 
Der Parameter "from_pt" oder "from_tf" kann das Modell von einem Framework in das andere konvertieren: + + + + +```py +>>> from transformers import AutoModel + +>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) +``` + + + +```py +>>> from transformers import TFAutoModel + +>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) +``` + + + +## Custom model builds + +Sie können die Konfigurationsklasse des Modells ändern, um zu bestimmen, wie ein Modell aufgebaut ist. Die Konfiguration legt die Attribute eines Modells fest, z. B. die Anzahl der verborgenen Schichten oder der Aufmerksamkeitsköpfe. Wenn Sie ein Modell aus einer benutzerdefinierten Konfigurationsklasse initialisieren, beginnen Sie bei Null. Die Modellattribute werden zufällig initialisiert, und Sie müssen das Modell trainieren, bevor Sie es verwenden können, um aussagekräftige Ergebnisse zu erhalten. + +Beginnen Sie mit dem Import von [`AutoConfig`] und laden Sie dann das trainierte Modell, das Sie ändern möchten. Innerhalb von [`AutoConfig.from_pretrained`] können Sie das Attribut angeben, das Sie ändern möchten, z. B. die Anzahl der Aufmerksamkeitsköpfe: + +```py +>>> from transformers import AutoConfig + +>>> my_config = AutoConfig.from_pretrained("distilbert-base-uncased", n_heads=12) +``` + + + +Create a model from your custom configuration with [`AutoModel.from_config`]: + +```py +>>> from transformers import AutoModel + +>>> my_model = AutoModel.from_config(my_config) +``` + + +Create a model from your custom configuration with [`TFAutoModel.from_config`]: + +```py +>>> from transformers import TFAutoModel + +>>> my_model = TFAutoModel.from_config(my_config) +``` + + + +Weitere Informationen zur Erstellung von benutzerdefinierten Konfigurationen finden Sie in der Anleitung [Erstellen einer benutzerdefinierten Architektur](./create_a_model). + +## Wie geht es weiter? + +Nachdem Sie nun die 🤗 Transformers-Kurztour abgeschlossen haben, schauen Sie sich unsere Anleitungen an und erfahren Sie, wie Sie spezifischere Dinge tun können, wie das Schreiben eines benutzerdefinierten Modells, die Feinabstimmung eines Modells für eine Aufgabe und wie man ein Modell mit einem Skript trainiert. Wenn Sie mehr über die Kernkonzepte von 🤗 Transformers erfahren möchten, nehmen Sie sich eine Tasse Kaffee und werfen Sie einen Blick auf unsere konzeptionellen Leitfäden! diff --git a/docs/source/de/quicktour.mdx b/docs/source/de/quicktour.mdx deleted file mode 100644 index 4c668bf419b1..000000000000 --- a/docs/source/de/quicktour.mdx +++ /dev/null @@ -1,428 +0,0 @@ - - -# Schnellstart - -[[open-in-colab]] - -Mit 🤗 Transformers können Sie sofort loslegen! Verwenden Sie die [`pipeline`] für schnelle Inferenz und laden Sie schnell ein vortrainiertes Modell und einen Tokenizer mit einer [AutoClass](./model_doc/auto), um Ihre Text-, Bild- oder Audioaufgabe zu lösen. - - - -Alle in der Dokumentation vorgestellten Codebeispiele haben oben links einen Umschalter für PyTorch und TensorFlow. Wenn -nicht, wird erwartet, dass der Code für beide Backends ohne Änderungen funktioniert. - - - -## Pipeline - -[`pipeline`] ist der einfachste Weg, ein vortrainiertes Modell für eine bestimmte Aufgabe zu verwenden. 
- - - -Die [`pipeline`] unterstützt viele gängige Aufgaben: - -**Text**: -* Stimmungsanalyse: Klassifizierung der Polarität eines gegebenen Textes. -* Textgenerierung (auf Englisch): Generierung von Text aus einer gegebenen Eingabe. -* Name-Entity-Recognition (NER): Kennzeichnung jedes Worts mit der Entität, die es repräsentiert (Person, Datum, Ort usw.). -* Beantwortung von Fragen: Extrahieren der Antwort aus dem Kontext, wenn ein gewisser Kontext und eine Frage gegeben sind. -* Fill-mask: Ausfüllen von Lücken in einem Text mit maskierten Wörtern. -* Zusammenfassung: Erstellung einer Zusammenfassung einer langen Text- oder Dokumentensequenz. -* Übersetzung: Übersetzen eines Textes in eine andere Sprache. -* Merkmalsextraktion: Erstellen einer Tensordarstellung des Textes. - -**Bild**: -* Bildklassifizierung: Klassifizierung eines Bildes. -* Bildsegmentierung: Klassifizierung jedes Pixels in einem Bild. -* Objekterkennung: Erkennen von Objekten innerhalb eines Bildes. - -**Audio**: -* Audioklassifizierung: Zuweisung eines Labels zu einem bestimmten Audiosegment. -* Automatische Spracherkennung (ASR): Transkription von Audiodaten in Text. - - - -Für mehr Details über die [`pipeline`] und assoziierte Aufgaben, schauen Sie in die Dokumentation [hier](./main_classes/pipelines). - - - -### Verwendung der Pipeline - -Im folgenden Beispiel werden Sie die [`pipeline`] für die Stimmungsanalyse verwenden. - -Installieren Sie die folgenden Abhängigkeiten, falls Sie dies nicht bereits getan haben: - - - -```bash -pip install torch -``` - - -```bash -pip install tensorflow -``` - - - -Importieren sie die [`pipeline`] und spezifizieren sie die Aufgabe, welche sie lösen möchten: - -```py ->>> from transformers import pipeline - ->>> classifier = pipeline("sentiment-analysis") -``` - -Die Pipeline lädt ein standardmäßiges [vortrainiertes Modell] (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) und einen Tokenizer für die Stimmungs-Analyse herunter und speichert sie. Jetzt können Sie den "Klassifikator" auf Ihren Zieltext anwenden: - -```py ->>> classifier("We are very happy to show you the 🤗 Transformers library.") -[{'label': 'POSITIVE', 'score': 0.9998}] -``` - -For more than one sentence, pass a list of sentences to the [`pipeline`] which returns a list of dictionaries: - -```py ->>> results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."]) ->>> for result in results: -... print(f"label: {result['label']}, with score: {round(result['score'], 4)}") -label: POSITIVE, with score: 0.9998 -label: NEGATIVE, with score: 0.5309 -``` - -Die [`pipeline`] kann auch über einen ganzen Datensatz iterieren. Starten wir mit der Installation der [🤗 Datasets](https://huggingface.co/docs/datasets/) Bibliothek: - -```bash -pip install datasets -``` - -Erstellen wir eine [`pipeline`] mit der Aufgabe die wir lösen und dem Modell welches wir nutzen möchten. - -```py ->>> import torch ->>> from transformers import pipeline - ->>> speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h") -``` - -Als nächstes laden wir den Datensatz (siehe 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart.html) für mehr Details) welches wir nutzen möchten. 
Zum Beispiel laden wir den [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) Datensatz: - -```py ->>> from datasets import load_dataset, Audio - ->>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") # doctest: +IGNORE_RESULT -``` - -Wir müssen sicherstellen, dass die Abtastrate des Datensatzes der Abtastrate entspricht, mit der `facebook/wav2vec2-base-960h` trainiert wurde. - -```py ->>> dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate)) -``` - -Audiodateien werden automatisch geladen und neu abgetastet, wenn die Spalte "audio" aufgerufen wird. -Extrahieren wir die rohen Wellenform-Arrays der ersten 4 Beispiele und übergeben wir sie als Liste an die Pipeline: - -```py ->>> result = speech_recognizer(dataset[:4]["audio"]) ->>> print([d["text"] for d in result]) -['I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT', "FODING HOW I'D SET UP A JOIN TO HET WITH MY WIFE AND WHERE THE AP MIGHT BE", "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE AP SO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AND I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS", 'HOW DO I THURN A JOIN A COUNT'] -``` - -Bei einem größeren Datensatz mit vielen Eingaben (wie bei Sprache oder Bildverarbeitung) sollten Sie einen Generator anstelle einer Liste übergeben, der alle Eingaben in den Speicher lädt. Weitere Informationen finden Sie in der [Pipeline-Dokumentation](./main_classes/pipelines). - -### Ein anderes Modell und einen anderen Tokenizer in der Pipeline verwenden - -Die [`pipeline`] kann jedes Modell aus dem [Model Hub] (https://huggingface.co/models) verwenden, wodurch es einfach ist, die [`pipeline`] für andere Anwendungsfälle anzupassen. Wenn Sie beispielsweise ein Modell wünschen, das französischen Text verarbeiten kann, verwenden Sie die Tags im Model Hub, um nach einem geeigneten Modell zu filtern. Das oberste gefilterte Ergebnis liefert ein mehrsprachiges [BERT-Modell](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment), das auf die Stimmungsanalyse abgestimmt ist. Großartig, verwenden wir dieses Modell! 
- -```py ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" -``` - - - -Use the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `AutoClass` below): - -```py ->>> from transformers import AutoTokenizer, AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained(model_name) ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) -``` - - -Use the [`TFAutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `TFAutoClass` below): - -```py ->>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name) ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) -``` - - - -Dann können Sie das Modell und den Tokenizer in der [`pipeline`] angeben und den `Klassifikator` auf Ihren Zieltext anwenden: - -```py ->>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer) ->>> classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.") -[{'label': '5 stars', 'score': 0.7273}] -``` - -Wenn Sie kein Modell für Ihren Anwendungsfall finden können, müssen Sie ein vortrainiertes Modell auf Ihren Daten feinabstimmen. Schauen Sie sich unser [Feinabstimmungs-Tutorial](./training) an, um zu erfahren, wie das geht. Und schließlich, nachdem Sie Ihr trainiertes Modell verfeinert haben, sollten Sie es mit der Community im Model Hub teilen (siehe Tutorial [hier](./model_sharing)), um NLP für alle zu demokratisieren! 🤗 - -## AutoClass - - - -Unter der Haube arbeiten die Klassen [`AutoModelForSequenceClassification`] und [`AutoTokenizer`] zusammen, um die [`pipeline`] zu betreiben. Eine [`AutoClass`](./model_doc/auto) ist eine Abkürzung, die automatisch die Architektur eines trainierten Modells aus dessen Namen oder Pfad abruft. Sie müssen nur die passende `AutoClass` für Ihre Aufgabe und den zugehörigen Tokenizer mit [`AutoTokenizer`] auswählen. - -Kehren wir zu unserem Beispiel zurück und sehen wir uns an, wie Sie die `AutoClass` verwenden können, um die Ergebnisse der [`pipeline`] zu replizieren. - -### AutoTokenizer - -Ein Tokenizer ist für die Vorverarbeitung von Text in ein für das Modell verständliches Format zuständig. Zunächst zerlegt der Tokenisierer den Text in Wörter, die *Token* genannt werden. Es gibt mehrere Regeln für den Tokenisierungsprozess, z. B. wie und auf welcher Ebene ein Wort aufgespalten wird (weitere Informationen über Tokenisierung [hier](./tokenizer_summary)). Das Wichtigste ist jedoch, dass Sie den Tokenizer mit demselben Modellnamen instanziieren müssen, um sicherzustellen, dass Sie dieselben Tokenisierungsregeln verwenden, mit denen ein Modell zuvor trainiert wurde. -Laden sie einen Tokenizer mit [`AutoTokenizer`]: - -```py ->>> from transformers import AutoTokenizer - ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) -``` - -Anschließend wandelt der Tokenizer die Token in Zahlen um, um einen Tensor als Eingabe für das Modell zu konstruieren. Dieser wird als *Vokabular* des Modells bezeichnet. 
- -Übergeben Sie Ihren Text an den Tokenizer: - -```py ->>> encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.") ->>> print(encoding) -{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 100, 58263, 13299, 119, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -Der Tokenizer gibt ein Wörterbuch zurück, das Folgendes enthält: - -* [input_ids](./glossary#input-ids): numerische Repräsentationen Ihrer Token. -* [atttention_mask](.glossary#attention-mask): gibt an, welche Token beachtet werden sollen. - -Genau wie die [`pipeline`] akzeptiert der Tokenizer eine Liste von Eingaben. Darüber hinaus kann der Tokenizer den Text auch auffüllen und kürzen, um einen Stapel mit einheitlicher Länge zurückzugeben: - - - -```py ->>> pt_batch = tokenizer( -... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], -... padding=True, -... truncation=True, -... max_length=512, -... return_tensors="pt", -... ) -``` - - -```py ->>> tf_batch = tokenizer( -... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], -... padding=True, -... truncation=True, -... max_length=512, -... return_tensors="tf", -... ) -``` - - - -Lesen Sie das Tutorial [preprocessing](./preprocessing) für weitere Details zur Tokenisierung. - -### AutoModel - - - -🤗 Transformers bietet eine einfache und einheitliche Möglichkeit, vortrainierte Instanzen zu laden. Das bedeutet, dass Sie ein [`AutoModel`] laden können, wie Sie einen [`AutoTokenizer`] laden würden. Der einzige Unterschied ist die Auswahl des richtigen [`AutoModel`] für die Aufgabe. Da Sie eine Text- oder Sequenzklassifizierung vornehmen, laden Sie [`AutoModelForSequenceClassification`]: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name) -``` - - - -In der [Aufgabenzusammenfassung](./task_summary) steht, welche [AutoModel]-Klasse für welche Aufgabe zu verwenden ist. - - - -Jetzt können Sie Ihren vorverarbeiteten Stapel von Eingaben direkt an das Modell übergeben. Sie müssen nur das Wörterbuch entpacken, indem Sie `**` hinzufügen: - -```py ->>> pt_outputs = pt_model(**pt_batch) -``` - -Das Modell gibt die endgültigen Aktivierungen in dem Attribut "logits" aus. Wenden Sie die Softmax-Funktion auf die "logits" an, um die Wahrscheinlichkeiten zu erhalten: - -```py ->>> from torch import nn - ->>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1) ->>> print(pt_predictions) -tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], - [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=) -``` - - -🤗 Transformers bietet eine einfache und einheitliche Methode zum Laden von vortrainierten Instanzen. Das bedeutet, dass Sie ein [`TFAutoModel`] genauso laden können, wie Sie einen [`AutoTokenizer`] laden würden. Der einzige Unterschied ist die Auswahl des richtigen [`TFAutoModel`] für die Aufgabe. 
Da Sie Text - oder Sequenz - Klassifizierung machen, laden Sie [`TFAutoModelForSequenceClassification`]: - -```py ->>> from transformers import TFAutoModelForSequenceClassification - ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name) -``` - - - -In der [Aufgabenzusammenfassung](./task_summary) steht, welche [AutoModel]-Klasse für welche Aufgabe zu verwenden ist. - - - -Jetzt können Sie Ihren vorverarbeiteten Stapel von Eingaben direkt an das Modell übergeben, indem Sie die Wörterbuchschlüssel direkt an die Tensoren übergeben: - -```py ->>> tf_outputs = tf_model(tf_batch) -``` - -Das Modell gibt die endgültigen Aktivierungen in dem Attribut "logits" aus. Wenden Sie die Softmax-Funktion auf die "logits" an, um die Wahrscheinlichkeiten zu erhalten: - -```py ->>> import tensorflow as tf - ->>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1) ->>> tf_predictions # doctest: +IGNORE_RESULT -``` - - - - - -Alle 🤗 Transformers-Modelle (PyTorch oder TensorFlow) geben die Tensoren *vor* der endgültigen Aktivierungsfunktion -Funktion (wie Softmax) aus, da die endgültige Aktivierungsfunktion oft mit dem Verlusten verschmolzen ist. - - - -Modelle sind ein standardmäßiges [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) oder ein [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model), sodass Sie sie in Ihrer üblichen Trainingsschleife verwenden können. Um jedoch die Dinge einfacher zu machen, bietet 🤗 Transformers eine [`Trainer`]-Klasse für PyTorch, die Funktionalität für verteiltes Training, gemischte Präzision und mehr bietet. Für TensorFlow können Sie die Methode `fit` aus [Keras](https://keras.io/) verwenden. Siehe das [training tutorial](./training) für weitere Details. - - - -Transformers-Modellausgaben sind spezielle Datenklassen, so dass ihre Attribute in einer IDE automatisch vervollständigt werden. -Die Modellausgänge verhalten sich auch wie ein Tupel oder ein Wörterbuch (z.B. können Sie mit einem Integer, einem Slice oder einem String indexieren), wobei die Attribute, die "None" sind, ignoriert werden. - - - -### Modell speichern - - - -Sobald Ihr Modell feinabgestimmt ist, können Sie es mit seinem Tokenizer speichern, indem Sie [`PreTrainedModel.save_pretrained`] verwenden: - -```py ->>> pt_save_directory = "./pt_save_pretrained" ->>> tokenizer.save_pretrained(pt_save_directory) # doctest: +IGNORE_RESULT ->>> pt_model.save_pretrained(pt_save_directory) -``` - -Wenn Sie bereit sind, das Modell erneut zu verwenden, laden Sie es mit [`PreTrainedModel.from_pretrained`]: - -```py ->>> pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained") -``` - - -Sobald Ihr Modell feinabgestimmt ist, können Sie es mit seinem Tokenizer unter Verwendung von [`TFPreTrainedModel.save_pretrained`] speichern: - -```py ->>> tf_save_directory = "./tf_save_pretrained" ->>> tokenizer.save_pretrained(tf_save_directory) # doctest: +IGNORE_RESULT ->>> tf_model.save_pretrained(tf_save_directory) -``` - -Wenn Sie bereit sind, das Modell wieder zu verwenden, laden Sie es mit [`TFPreTrainedModel.from_pretrained`]: - -```py ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained") -``` - - - -Ein besonders cooles 🤗 Transformers-Feature ist die Möglichkeit, ein Modell zu speichern und es entweder als PyTorch- oder TensorFlow-Modell wieder zu laden. 
Der Parameter "from_pt" oder "from_tf" kann das Modell von einem Framework in das andere konvertieren: - - - -```py ->>> from transformers import AutoModel - ->>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) -``` - - -```py ->>> from transformers import TFAutoModel - ->>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) -``` - - - -## Custom model builds - -Sie können die Konfigurationsklasse des Modells ändern, um zu bestimmen, wie ein Modell aufgebaut ist. Die Konfiguration legt die Attribute eines Modells fest, z. B. die Anzahl der verborgenen Schichten oder der Aufmerksamkeitsköpfe. Wenn Sie ein Modell aus einer benutzerdefinierten Konfigurationsklasse initialisieren, beginnen Sie bei Null. Die Modellattribute werden zufällig initialisiert, und Sie müssen das Modell trainieren, bevor Sie es verwenden können, um aussagekräftige Ergebnisse zu erhalten. - -Beginnen Sie mit dem Import von [`AutoConfig`] und laden Sie dann das trainierte Modell, das Sie ändern möchten. Innerhalb von [`AutoConfig.from_pretrained`] können Sie das Attribut angeben, das Sie ändern möchten, z. B. die Anzahl der Aufmerksamkeitsköpfe: - -```py ->>> from transformers import AutoConfig - ->>> my_config = AutoConfig.from_pretrained("distilbert-base-uncased", n_heads=12) -``` - - - -Create a model from your custom configuration with [`AutoModel.from_config`]: - -```py ->>> from transformers import AutoModel - ->>> my_model = AutoModel.from_config(my_config) -``` - - -Create a model from your custom configuration with [`TFAutoModel.from_config`]: - -```py ->>> from transformers import TFAutoModel - ->>> my_model = TFAutoModel.from_config(my_config) -``` - - - -Weitere Informationen zur Erstellung von benutzerdefinierten Konfigurationen finden Sie in der Anleitung [Erstellen einer benutzerdefinierten Architektur](./create_a_model). - -## Wie geht es weiter? - -Nachdem Sie nun die 🤗 Transformers-Kurztour abgeschlossen haben, schauen Sie sich unsere Anleitungen an und erfahren Sie, wie Sie spezifischere Dinge tun können, wie das Schreiben eines benutzerdefinierten Modells, die Feinabstimmung eines Modells für eine Aufgabe und wie man ein Modell mit einem Skript trainiert. Wenn Sie mehr über die Kernkonzepte von 🤗 Transformers erfahren möchten, nehmen Sie sich eine Tasse Kaffee und werfen Sie einen Blick auf unsere konzeptionellen Leitfäden! diff --git a/docs/source/de/run_scripts.md b/docs/source/de/run_scripts.md new file mode 100644 index 000000000000..2902d4c08414 --- /dev/null +++ b/docs/source/de/run_scripts.md @@ -0,0 +1,351 @@ + + +# Trainieren mit einem Skript + +Neben den 🤗 Transformers [notebooks](./noteboks/README) gibt es auch Beispielskripte, die zeigen, wie man ein Modell für eine Aufgabe mit [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow) oder [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax) trainiert. + +Sie werden auch Skripte finden, die wir in unseren [Forschungsprojekten](https://github.com/huggingface/transformers/tree/main/examples/research_projects) und [Legacy-Beispielen](https://github.com/huggingface/transformers/tree/main/examples/legacy) verwendet haben und die größtenteils von der Community stammen. 
Diese Skripte werden nicht aktiv gepflegt und erfordern eine bestimmte Version von 🤗 Transformers, die höchstwahrscheinlich nicht mit der neuesten Version der Bibliothek kompatibel ist. + +Es wird nicht erwartet, dass die Beispielskripte bei jedem Problem sofort funktionieren. Möglicherweise müssen Sie das Skript an das Problem anpassen, das Sie zu lösen versuchen. Um Ihnen dabei zu helfen, legen die meisten Skripte vollständig offen, wie die Daten vorverarbeitet werden, so dass Sie sie nach Bedarf für Ihren Anwendungsfall bearbeiten können. + +Für jede Funktion, die Sie in einem Beispielskript implementieren möchten, diskutieren Sie bitte im [Forum] (https://discuss.huggingface.co/) oder in einem [issue] (https://github.com/huggingface/transformers/issues), bevor Sie einen Pull Request einreichen. Wir freuen uns zwar über Fehlerkorrekturen, aber es ist unwahrscheinlich, dass wir einen Pull Request zusammenführen, der mehr Funktionalität auf Kosten der Lesbarkeit hinzufügt. + +Diese Anleitung zeigt Ihnen, wie Sie ein Beispiel für ein Trainingsskript zur Zusammenfassung in [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) und [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization) ausführen können. Sofern nicht anders angegeben, sollten alle Beispiele mit beiden Frameworks funktionieren. + +## Einrichtung + +Um die neueste Version der Beispielskripte erfolgreich auszuführen, **müssen Sie 🤗 Transformers aus dem Quellcode** in einer neuen virtuellen Umgebung installieren: + +```bash +git clone https://github.com/huggingface/transformers +cd transformers +pip install . +``` + +Für ältere Versionen der Beispielskripte klicken Sie auf die Umschalttaste unten: + +
+	Beispiele für ältere Versionen von 🤗 Transformers
+
+
+ +Dann stellen Sie Ihren aktuellen Klon von 🤗 Transformers auf eine bestimmte Version um, z.B. v3.5.1: + +```bash +git checkout tags/v3.5.1 +``` + +Nachdem Sie die richtige Bibliotheksversion eingerichtet haben, navigieren Sie zu dem Beispielordner Ihrer Wahl und installieren die beispielspezifischen Anforderungen: + +```bash +pip install -r requirements.txt +``` + +## Ein Skript ausführen + + + +Das Beispielskript lädt einen Datensatz aus der 🤗 [Datasets](https://huggingface.co/docs/datasets/) Bibliothek herunter und verarbeitet ihn vor. Dann nimmt das Skript eine Feinabstimmung eines Datensatzes mit dem [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) auf einer Architektur vor, die eine Zusammenfassung unterstützt. Das folgende Beispiel zeigt, wie die Feinabstimmung von [T5-small](https://huggingface.co/t5-small) auf dem Datensatz [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) durchgeführt wird. Das T5-Modell benötigt aufgrund der Art und Weise, wie es trainiert wurde, ein zusätzliches Argument `source_prefix`. Mit dieser Eingabeaufforderung weiß T5, dass es sich um eine Zusammenfassungsaufgabe handelt. + +```bash +python examples/pytorch/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + + +Das Beispielskript lädt einen Datensatz aus der 🤗 [Datasets](https://huggingface.co/docs/datasets/) Bibliothek herunter und verarbeitet ihn vor. Anschließend nimmt das Skript die Feinabstimmung eines Datensatzes mit Keras auf einer Architektur vor, die die Zusammenfassung unterstützt. Das folgende Beispiel zeigt, wie die Feinabstimmung von [T5-small](https://huggingface.co/t5-small) auf dem [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) Datensatz durchgeführt wird. Das T5-Modell benötigt aufgrund der Art und Weise, wie es trainiert wurde, ein zusätzliches Argument `source_prefix`. Mit dieser Eingabeaufforderung weiß T5, dass es sich um eine Zusammenfassungsaufgabe handelt. + +```bash +python examples/tensorflow/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 16 \ + --num_train_epochs 3 \ + --do_train \ + --do_eval +``` + + + +## Verteiltes Training und gemischte Präzision + +Der [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) unterstützt verteiltes Training und gemischte Präzision, d.h. Sie können ihn auch in einem Skript verwenden. So aktivieren Sie diese beiden Funktionen: + +- Fügen Sie das Argument `fp16` hinzu, um gemischte Genauigkeit zu aktivieren. +- Legen Sie die Anzahl der zu verwendenden GPUs mit dem Argument `nproc_per_node` fest. 
+ +```bash +python -m torch.distributed.launch \ + --nproc_per_node 8 pytorch/summarization/run_summarization.py \ + --fp16 \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + +TensorFlow-Skripte verwenden eine [`MirroredStrategy`](https://www.tensorflow.org/guide/distributed_training#mirroredstrategy) für verteiltes Training, und Sie müssen dem Trainingsskript keine zusätzlichen Argumente hinzufügen. Das TensorFlow-Skript verwendet standardmäßig mehrere GPUs, wenn diese verfügbar sind. + +## Ein Skript auf einer TPU ausführen + + + +Tensor Processing Units (TPUs) sind speziell für die Beschleunigung der Leistung konzipiert. PyTorch unterstützt TPUs mit dem [XLA](https://www.tensorflow.org/xla) Deep Learning Compiler (siehe [hier](https://github.com/pytorch/xla/blob/master/README.md) für weitere Details). Um eine TPU zu verwenden, starten Sie das Skript `xla_spawn.py` und verwenden das Argument `num_cores`, um die Anzahl der TPU-Kerne festzulegen, die Sie verwenden möchten. + +```bash +python xla_spawn.py --num_cores 8 \ + summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + + +Tensor Processing Units (TPUs) sind speziell für die Beschleunigung der Leistung konzipiert. TensorFlow Skripte verwenden eine [`TPUStrategy`](https://www.tensorflow.org/guide/distributed_training#tpustrategy) für das Training auf TPUs. Um eine TPU zu verwenden, übergeben Sie den Namen der TPU-Ressource an das Argument `tpu`. + +```bash +python run_summarization.py \ + --tpu name_of_tpu_resource \ + --model_name_or_path t5-small \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 16 \ + --num_train_epochs 3 \ + --do_train \ + --do_eval +``` + + + +## Führen Sie ein Skript mit 🤗 Accelerate aus. + +🤗 [Accelerate](https://huggingface.co/docs/accelerate) ist eine reine PyTorch-Bibliothek, die eine einheitliche Methode für das Training eines Modells auf verschiedenen Arten von Setups (nur CPU, mehrere GPUs, TPUs) bietet und dabei die vollständige Transparenz der PyTorch-Trainingsschleife beibehält. Stellen Sie sicher, dass Sie 🤗 Accelerate installiert haben, wenn Sie es nicht bereits haben: + +> Hinweis: Da Accelerate schnell weiterentwickelt wird, muss die Git-Version von Accelerate installiert sein, um die Skripte auszuführen. +```bash +pip install git+https://github.com/huggingface/accelerate +``` + +Anstelle des Skripts `run_summarization.py` müssen Sie das Skript `run_summarization_no_trainer.py` verwenden. Die von Accelerate unterstützten Skripte haben eine Datei `task_no_trainer.py` im Ordner. 
Beginnen Sie mit dem folgenden Befehl, um eine Konfigurationsdatei zu erstellen und zu speichern: + +```bash +accelerate config +``` + +Testen Sie Ihre Einrichtung, um sicherzustellen, dass sie korrekt konfiguriert ist: + +```bash +accelerate test +``` + +Jetzt sind Sie bereit, das Training zu starten: + +```bash +accelerate launch run_summarization_no_trainer.py \ + --model_name_or_path t5-small \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir ~/tmp/tst-summarization +``` + +## Verwenden Sie einen benutzerdefinierten Datensatz + +Das Verdichtungsskript unterstützt benutzerdefinierte Datensätze, solange es sich um eine CSV- oder JSON-Line-Datei handelt. Wenn Sie Ihren eigenen Datensatz verwenden, müssen Sie mehrere zusätzliche Argumente angeben: + +- `train_file` und `validation_file` geben den Pfad zu Ihren Trainings- und Validierungsdateien an. +- text_column` ist der Eingabetext, der zusammengefasst werden soll. +- Summary_column" ist der auszugebende Zieltext. + +Ein Zusammenfassungsskript, das einen benutzerdefinierten Datensatz verwendet, würde wie folgt aussehen: + +```bash +python examples/pytorch/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --train_file path_to_csv_or_jsonlines_file \ + --validation_file path_to_csv_or_jsonlines_file \ + --text_column text_column_name \ + --summary_column summary_column_name \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --overwrite_output_dir \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --predict_with_generate +``` + +## Testen Sie ein Skript + +Es ist oft eine gute Idee, Ihr Skript an einer kleineren Anzahl von Beispielen für Datensätze auszuführen, um sicherzustellen, dass alles wie erwartet funktioniert, bevor Sie sich auf einen ganzen Datensatz festlegen, dessen Fertigstellung Stunden dauern kann. Verwenden Sie die folgenden Argumente, um den Datensatz auf eine maximale Anzahl von Stichproben zu beschränken: + +- `max_train_samples` +- `max_eval_samples` +- `max_predict_samples` + +```bash +python examples/pytorch/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --max_train_samples 50 \ + --max_eval_samples 50 \ + --max_predict_samples 50 \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + +Nicht alle Beispielskripte unterstützen das Argument `max_predict_samples`. Wenn Sie sich nicht sicher sind, ob Ihr Skript dieses Argument unterstützt, fügen Sie das Argument `-h` hinzu, um dies zu überprüfen: + +```bash +examples/pytorch/summarization/run_summarization.py -h +``` + +## Training vom Kontrollpunkt fortsetzen + +Eine weitere hilfreiche Option, die Sie aktivieren können, ist die Wiederaufnahme des Trainings von einem früheren Kontrollpunkt aus. Auf diese Weise können Sie im Falle einer Unterbrechung Ihres Trainings dort weitermachen, wo Sie aufgehört haben, ohne von vorne beginnen zu müssen. Es gibt zwei Methoden, um das Training von einem Kontrollpunkt aus wieder aufzunehmen. + +Die erste Methode verwendet das Argument `output_dir previous_output_dir`, um das Training ab dem letzten in `output_dir` gespeicherten Kontrollpunkt wieder aufzunehmen. 
In diesem Fall sollten Sie `overwrite_output_dir` entfernen: + +```bash +python examples/pytorch/summarization/run_summarization.py + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --output_dir previous_output_dir \ + --predict_with_generate +``` + +Die zweite Methode verwendet das Argument `Resume_from_checkpoint path_to_specific_checkpoint`, um das Training ab einem bestimmten Checkpoint-Ordner wieder aufzunehmen. + +```bash +python examples/pytorch/summarization/run_summarization.py + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --resume_from_checkpoint path_to_specific_checkpoint \ + --predict_with_generate +``` + +## Teilen Sie Ihr Modell + +Alle Skripte können Ihr endgültiges Modell in den [Model Hub](https://huggingface.co/models) hochladen. Stellen Sie sicher, dass Sie bei Hugging Face angemeldet sind, bevor Sie beginnen: + +```bash +huggingface-cli login +``` + +Dann fügen Sie dem Skript das Argument `push_to_hub` hinzu. Mit diesem Argument wird ein Repository mit Ihrem Hugging Face-Benutzernamen und dem in `output_dir` angegebenen Ordnernamen erstellt. + +Wenn Sie Ihrem Repository einen bestimmten Namen geben möchten, fügen Sie ihn mit dem Argument `push_to_hub_model_id` hinzu. Das Repository wird automatisch unter Ihrem Namensraum aufgeführt. + +Das folgende Beispiel zeigt, wie Sie ein Modell mit einem bestimmten Repository-Namen hochladen können: + +```bash +python examples/pytorch/summarization/run_summarization.py + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --push_to_hub \ + --push_to_hub_model_id finetuned-t5-cnn_dailymail \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` \ No newline at end of file diff --git a/docs/source/de/testing.md b/docs/source/de/testing.md new file mode 100644 index 000000000000..e921484fa2f6 --- /dev/null +++ b/docs/source/de/testing.md @@ -0,0 +1,1293 @@ + + +# Testen + + +Werfen wir einen Blick darauf, wie 🤗 Transformers-Modelle getestet werden und wie Sie neue Tests schreiben und die vorhandenen verbessern können. + +Es gibt 2 Testsuiten im Repository: + +1. `tests` -- Tests für die allgemeine API +2. `examples` -- Tests hauptsächlich für verschiedene Anwendungen, die nicht Teil der API sind + +## Wie Transformatoren getestet werden + +1. Sobald ein PR eingereicht wurde, wird er mit 9 CircleCi Jobs getestet. Jeder neue Commit zu diesem PR wird erneut getestet. Diese Aufträge + sind in dieser [Konfigurationsdatei](https://github.com/huggingface/transformers/tree/main/.circleci/config.yml) definiert, so dass Sie bei Bedarf die gleiche Umgebung auf Ihrem Rechner reproduzieren können. + Umgebung auf Ihrem Rechner reproduzieren können. + + Diese CI-Jobs führen keine `@slow`-Tests durch. + +2. 
Es gibt 3 Jobs, die von [github actions](https://github.com/huggingface/transformers/actions) ausgeführt werden: + + - [torch hub integration](https://github.com/huggingface/transformers/tree/main/.github/workflows/github-torch-hub.yml): prüft, ob die torch hub + Integration funktioniert. + + - [self-hosted (push)](https://github.com/huggingface/transformers/tree/main/.github/workflows/self-push.yml): führt schnelle Tests auf der GPU nur bei Commits auf + `main`. Es wird nur ausgeführt, wenn ein Commit auf `main` den Code in einem der folgenden Ordner aktualisiert hat: `src`, + `tests`, `.github` (um zu verhindern, dass er auf hinzugefügten Modellkarten, Notebooks usw. läuft) + + - [self-hosted runner](https://github.com/huggingface/transformers/tree/main/.github/workflows/self-scheduled.yml): führt normale und langsame Tests auf GPU in + `tests` und `examples`: + +```bash +RUN_SLOW=1 pytest tests/ +RUN_SLOW=1 pytest examples/ +``` + + Die Ergebnisse können Sie [hier](https://github.com/huggingface/transformers/actions) sehen. + + + +## Tests ausführen + + + + + +### Auswahl der auszuführenden Tests + +In diesem Dokument wird ausführlich erläutert, wie Tests ausgeführt werden können. Wenn Sie nach der Lektüre noch mehr Details benötigen +finden Sie diese [hier](https://docs.pytest.org/en/latest/usage.html). + +Hier sind einige der nützlichsten Möglichkeiten, Tests auszuführen. + +Alle ausführen: + +```console +pytest +``` + +oder: + +```bash +make test +``` + +Beachten Sie, dass Letzteres wie folgt definiert ist: + +```bash +python -m pytest -n auto --dist=loadfile -s -v ./tests/ +``` + +was pytest anweist: + +- so viele Testprozesse laufen zu lassen, wie es CPU-Kerne gibt (was zu viele sein könnten, wenn Sie nicht über eine Menge RAM verfügen!) +- sicherzustellen, dass alle Tests aus derselben Datei von demselben Testprozess ausgeführt werden +- Erfassen Sie keine Ausgaben +- im ausführlichen Modus laufen lassen + + + +### Abrufen der Liste aller Tests + +Alle Tests der Testsuite: + +```bash +pytest --collect-only -q +``` + +Alle Tests einer bestimmten Testdatei: + +```bash +pytest tests/test_optimization.py --collect-only -q +``` + +### Führen Sie ein bestimmtes Testmodul aus + +Um ein einzelnes Testmodul auszuführen: + +```bash +pytest tests/utils/test_logging.py +``` + +### Spezifische Tests ausführen + +Da unittest in den meisten Tests verwendet wird, müssen Sie, um bestimmte Untertests auszuführen, den Namen der unittest +Klasse, die diese Tests enthält. Er könnte zum Beispiel lauten: + +```bash +pytest tests/test_optimization.py::OptimizationTest::test_adam_w +``` + +Hier: + +- `tests/test_optimization.py` - die Datei mit den Tests +- `OptimizationTest` - der Name der Klasse +- `test_adam_w` - der Name der spezifischen Testfunktion + +Wenn die Datei mehrere Klassen enthält, können Sie auswählen, dass nur die Tests einer bestimmten Klasse ausgeführt werden sollen. Zum Beispiel: + +```bash +pytest tests/test_optimization.py::OptimizationTest +``` + +führt alle Tests innerhalb dieser Klasse aus. + +Wie bereits erwähnt, können Sie sehen, welche Tests in der Klasse `OptimizationTest` enthalten sind, indem Sie sie ausführen: + +```bash +pytest tests/test_optimization.py::OptimizationTest --collect-only -q +``` + +Sie können Tests mit Hilfe von Schlüsselwortausdrücken ausführen. 
+ +Um nur Tests auszuführen, deren Name `adam` enthält: + +```bash +pytest -k adam tests/test_optimization.py +``` + +Die logischen `und` und `oder` können verwendet werden, um anzugeben, ob alle Schlüsselwörter übereinstimmen sollen oder nur eines. `nicht` kann verwendet werden, um +negieren. + +Um alle Tests auszuführen, außer denen, deren Name `adam` enthält: + +```bash +pytest -k "not adam" tests/test_optimization.py +``` + +Und Sie können die beiden Muster in einem kombinieren: + +```bash +pytest -k "ada and not adam" tests/test_optimization.py +``` + +Um zum Beispiel sowohl `test_adafactor` als auch `test_adam_w` auszuführen, können Sie verwenden: + +```bash +pytest -k "test_adam_w or test_adam_w" tests/test_optimization.py +``` + +Beachten Sie, dass wir hier `oder` verwenden, da wir wollen, dass eines der Schlüsselwörter übereinstimmt, um beide einzuschließen. + +Wenn Sie nur Tests einschließen möchten, die beide Muster enthalten, müssen Sie `und` verwenden: + +```bash +pytest -k "test and ada" tests/test_optimization.py +``` + +### Führen Sie `accelerate` Tests durch + +Manchmal müssen Sie `accelerate` Tests für Ihre Modelle ausführen. Dazu fügen Sie einfach `-m accelerate_tests` zu Ihrem Befehl hinzu, wenn Sie diese Tests bei einem `OPT`-Lauf ausführen möchten: +```bash +RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py +``` + + +### Dokumentationstests ausführen + +Um zu testen, ob die Dokumentationsbeispiele korrekt sind, sollten Sie überprüfen, ob die `doctests` erfolgreich sind. +Lassen Sie uns als Beispiel den docstring von [WhisperModel.forward](https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py#L1017-L1035) verwenden: + +```python +r""" +Returns: + +Example: + ```python + >>> import torch + >>> from transformers import WhisperModel, WhisperFeatureExtractor + >>> from datasets import load_dataset + + >>> model = WhisperModel.from_pretrained("openai/whisper-base") + >>> feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base") + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt") + >>> input_features = inputs.input_features + >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id + >>> last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state + >>> list(last_hidden_state.shape) + [1, 2, 512] + ```""" + +``` + +Führen Sie einfach die folgende Zeile aus, um automatisch jedes docstring-Beispiel in der gewünschten Datei zu testen: +```bash +pytest --doctest-modules +``` +Wenn die Datei eine Markdown-Erweiterung hat, sollten Sie das Argument `--doctest-glob="*.md"` hinzufügen. + +### Nur geänderte Tests ausführen + +Mit [pytest-picked](https://github.com/anapaulagomes/pytest-picked) können Sie die Tests ausführen, die sich auf die unstaged Dateien oder den aktuellen Zweig (gemäß Git) beziehen. Auf diese Weise können Sie schnell testen, ob Ihre Änderungen nichts kaputt gemacht haben. +nichts kaputt gemacht haben, da die Tests für Dateien, die Sie nicht verändert haben, nicht ausgeführt werden. + +```bash +pip install pytest-picked +``` + +```bash +pytest --picked +``` + +Alle Tests werden von Dateien und Ordnern ausgeführt, die geändert, aber noch nicht übergeben wurden. 
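+
+Falls Ihre Änderungen bereits committet sind und in einem eigenen Zweig liegen, können Sie stattdessen alle Tests ausführen, die sich auf die gegenüber dem Hauptzweig geänderten Dateien beziehen. Eine minimale Skizze, unter der Annahme, dass Ihre installierte Version von `pytest-picked` die Option `--mode` unterstützt:
+
+```bash
+pytest --picked --mode=branch
+```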
+ +### Fehlgeschlagene Tests bei Änderung der Quelle automatisch wiederholen + +[pytest-xdist](https://github.com/pytest-dev/pytest-xdist) bietet eine sehr nützliche Funktion zur Erkennung aller fehlgeschlagenen +Tests zu erkennen und dann darauf zu warten, dass Sie Dateien ändern, um die fehlgeschlagenen Tests so lange zu wiederholen, bis sie erfolgreich sind, während Sie die +sie reparieren. So müssen Sie pytest nicht erneut starten, nachdem Sie die Korrektur vorgenommen haben. Dies wird so lange wiederholt, bis alle Tests bestanden sind. +Danach wird erneut ein vollständiger Durchlauf durchgeführt. + +```bash +pip install pytest-xdist +``` + +So rufen Sie den Modus auf: `pytest -f` oder `pytest --looponfail` + +Datei-Änderungen werden erkannt, indem die Wurzelverzeichnisse von `looponfailroots` und alle ihre Inhalte (rekursiv) untersucht werden. +Wenn die Vorgabe für diesen Wert für Sie nicht funktioniert, können Sie ihn in Ihrem Projekt ändern, indem Sie eine Konfigurations +Option in der Datei `setup.cfg` ändern: + +```ini +[tool:pytest] +looponfailroots = transformers tests +``` + +oder die Dateien `pytest.ini`/`tox.ini``: + +```ini +[pytest] +looponfailroots = transformers tests +``` + +Dies würde dazu führen, dass nur nach Dateiänderungen in den jeweiligen Verzeichnissen gesucht wird, die relativ zum Verzeichnis der ini-Datei angegeben sind. +Verzeichnis. + +[pytest-watch](https://github.com/joeyespo/pytest-watch) ist eine alternative Implementierung dieser Funktionalität. + + +### Überspringen eines Testmoduls + +Wenn Sie alle Testmodule ausführen möchten, mit Ausnahme einiger weniger, können Sie diese ausschließen, indem Sie eine explizite Liste der auszuführenden Tests angeben. Für +Beispiel: Um alle Tests außer `test_modeling_*.py` auszuführen: + +```bash +pytest *ls -1 tests/*py | grep -v test_modeling* +``` + +### Status leeren + +CI-Builds und wenn Isolation wichtig ist (gegen Geschwindigkeit), sollte der Cache geleert werden: + +```bash +pytest --cache-clear tests +``` + +### Tests parallel ausführen + +Wie bereits erwähnt, führt `make test` über das Plugin `pytest-xdist` Tests parallel aus (Argument `-n X`, z.B. `-n 2` +um 2 Jobs parallel laufen zu lassen). + +Mit der Option `--dist=` von `pytest-xdist` können Sie steuern, wie die Tests gruppiert werden. Mit `--dist=loadfile` werden die +Tests, die sich in einer Datei befinden, in denselben Prozess. + +Da die Reihenfolge der ausgeführten Tests unterschiedlich und nicht vorhersehbar ist, kann die Ausführung der Testsuite mit `pytest-xdist` +zu Fehlern führt (was bedeutet, dass wir einige unentdeckte gekoppelte Tests haben), verwenden Sie [pytest-replay](https://github.com/ESSS/pytest-replay), um die Tests in der gleichen Reihenfolge abzuspielen, was dabei helfen sollte +diese fehlgeschlagene Sequenz auf ein Minimum zu reduzieren. + +### Testreihenfolge und Wiederholung + +Es ist gut, die Tests mehrmals zu wiederholen, nacheinander, zufällig oder in Gruppen, um mögliche +Abhängigkeiten und zustandsbezogene Fehler zu erkennen (Abriss). Und die einfache, mehrfache Wiederholung ist einfach gut, um +einige Probleme zu erkennen, die durch die Zufälligkeit von DL aufgedeckt werden. 
+ + +#### Wiederholungstests + +- [pytest-flakefinder](https://github.com/dropbox/pytest-flakefinder): + +```bash +pip install pytest-flakefinder +``` + +Und führen Sie dann jeden Test mehrmals durch (standardmäßig 50): + +```bash +pytest --flake-finder --flake-runs=5 tests/test_failing_test.py +``` + + + +Dieses Plugin funktioniert nicht mit dem `-n` Flag von `pytest-xdist`. + + + + + +Es gibt noch ein anderes Plugin `pytest-repeat`, aber es funktioniert nicht mit `unittest`. + + + +#### Run tests in a random order + +```bash +pip install pytest-random-order +``` + +Wichtig: Das Vorhandensein von `pytest-random-order` sorgt für eine automatische Zufallsanordnung der Tests, es sind keine Konfigurationsänderungen oder +Befehlszeilenoptionen sind nicht erforderlich. + +Wie bereits erläutert, ermöglicht dies die Erkennung von gekoppelten Tests - bei denen der Zustand eines Tests den Zustand eines anderen beeinflusst. Wenn +`pytest-random-order` installiert ist, gibt es den Zufallswert aus, der für diese Sitzung verwendet wurde, z.B: + +```bash +pytest tests +[...] +Using --random-order-bucket=module +Using --random-order-seed=573663 +``` + +Wenn eine bestimmte Sequenz fehlschlägt, können Sie sie reproduzieren, indem Sie genau diesen Seed hinzufügen, z.B: + +```bash +pytest --random-order-seed=573663 +[...] +Using --random-order-bucket=module +Using --random-order-seed=573663 +``` + +Es wird nur dann die exakte Reihenfolge reproduzieren, wenn Sie genau dieselbe Liste von Tests (oder gar keine Liste) verwenden. Sobald Sie beginnen, die Liste +die Liste manuell einzugrenzen, können Sie sich nicht mehr auf den Seed verlassen, sondern müssen die Tests manuell in der genauen Reihenfolge auflisten +auflisten und pytest anweisen, sie nicht zu randomisieren, indem Sie `--random-order-bucket=none` verwenden, z.B.: + +```bash +pytest --random-order-bucket=none tests/test_a.py tests/test_c.py tests/test_b.py +``` + +So deaktivieren Sie das Shuffling für alle Tests: + +```bash +pytest --random-order-bucket=none +``` + +Standardmäßig ist `--random-order-bucket=module` impliziert, wodurch die Dateien auf den Modulebenen gemischt werden. Es kann auch +auf den Ebenen `class`, `package`, `global` und `none` mischen. Die vollständigen Details entnehmen Sie bitte der +[Dokumentation] (https://github.com/jbasko/pytest-random-order). + +Eine weitere Alternative zur Randomisierung ist: [`pytest-random`](https://github.com/pytest-dev/pytest-randomly). Dieses +Modul hat eine sehr ähnliche Funktionalität/Schnittstelle, aber es hat nicht die Eimermodi, die in +`pytest-random-order` zur Verfügung. Es hat das gleiche Problem, dass es sich nach der Installation aufdrängt. + +### Variationen von Aussehen und Bedienung + +#### pytest-zucker + +[pytest-sugar](https://github.com/Frozenball/pytest-sugar) ist ein Plugin, das das Erscheinungsbild verbessert, eine +Fortschrittsbalken hinzufügt und Tests, die fehlschlagen, sowie die Bestätigung sofort anzeigt. Es wird bei der Installation automatisch aktiviert. + +```bash +pip install pytest-sugar +``` + +Um Tests ohne sie durchzuführen, führen Sie aus: + +```bash +pytest -p no:sugar +``` + +oder deinstallieren Sie es. 
+ + + +#### Melden Sie den Namen jedes Subtests und seinen Fortschritt + +Für einen einzelnen oder eine Gruppe von Tests über `pytest` (nach `pip install pytest-pspec`): + +```bash +pytest --pspec tests/test_optimization.py +``` + +#### Zeigt fehlgeschlagene Tests sofort an + +[pytest-instafail](https://github.com/pytest-dev/pytest-instafail) zeigt Fehlschläge und Fehler sofort an, anstatt +bis zum Ende der Testsitzung zu warten. + +```bash +pip install pytest-instafail +``` + +```bash +pytest --instafail +``` + +### Zu GPU oder nicht zu GPU + +Bei einem GPU-aktivierten Setup fügen Sie zum Testen im reinen CPU-Modus `CUDA_VISIBLE_DEVICES=""` hinzu: + +```bash +CUDA_VISIBLE_DEVICES="" pytest tests/utils/test_logging.py +``` + +oder wenn Sie mehrere Grafikprozessoren haben, können Sie angeben, welcher von `pytest` verwendet werden soll. Wenn Sie zum Beispiel nur den +zweiten Grafikkarte zu verwenden, wenn Sie die Grafikkarten `0` und `1` haben, können Sie folgendes ausführen: + +```bash +CUDA_VISIBLE_DEVICES="1" pytest tests/utils/test_logging.py +``` + +Dies ist praktisch, wenn Sie verschiedene Aufgaben auf verschiedenen GPUs ausführen möchten. + +Einige Tests müssen nur auf der CPU ausgeführt werden, andere entweder auf der CPU, der GPU oder der TPU und wieder andere auf mehreren GPUs. Die folgenden skip +Dekorateure werden verwendet, um die Anforderungen von Tests in Bezug auf CPU/GPU/TPU festzulegen: + +- `require_torch` - dieser Test wird nur unter Torch ausgeführt +- `require_torch_gpu` - wie `require_torch` plus erfordert mindestens 1 GPU +- `require_torch_multi_gpu` - wie `require_torch` und zusätzlich mindestens 2 GPUs erforderlich +- `require_torch_non_multi_gpu` - wie `require_torch` plus benötigt 0 oder 1 GPUs +- `require_torch_up_to_2_gpus` - wie `require_torch` plus erfordert 0 oder 1 oder 2 GPUs +- `require_torch_tpu` - wie `require_torch` plus erfordert mindestens 1 TPU + +Lassen Sie uns die GPU-Anforderungen in der folgenden Tabelle darstellen: + + +| n gpus | decorator | +|--------|--------------------------------| +| `>= 0` | `@require_torch` | +| `>= 1` | `@require_torch_gpu` | +| `>= 2` | `@require_torch_multi_gpu` | +| `< 2` | `@require_torch_non_multi_gpu` | +| `< 3` | `@require_torch_up_to_2_gpus` | + + +Hier ist zum Beispiel ein Test, der nur ausgeführt werden muss, wenn 2 oder mehr GPUs verfügbar sind und pytorch installiert ist: + +```python no-style +@require_torch_multi_gpu +def test_example_with_multi_gpu(): +``` + +Wenn ein Test `tensorflow` benötigt, verwenden Sie den Dekorator `require_tf`. Zum Beispiel: + +```python no-style +@require_tf +def test_tf_thing_with_tensorflow(): +``` + +Diese Dekors können gestapelt werden. Wenn zum Beispiel ein Test langsam ist und mindestens eine GPU unter pytorch benötigt, können Sie +wie Sie ihn einrichten können: + +```python no-style +@require_torch_gpu +@slow +def test_example_slow_on_gpu(): +``` + +Einige Dekoratoren wie `@parametrized` schreiben Testnamen um, daher müssen `@require_*`-Sprungdekoratoren als letztes aufgeführt werden. +zuletzt aufgeführt werden, damit sie korrekt funktionieren. Hier ist ein Beispiel für die korrekte Verwendung: + +```python no-style +@parameterized.expand(...) +@require_torch_multi_gpu +def test_integration_foo(): +``` + +Dieses Problem mit der Reihenfolge gibt es bei `@pytest.mark.parametrize` nicht, Sie können es an den Anfang oder an den Schluss setzen und es wird trotzdem funktionieren. +funktionieren. Aber es funktioniert nur bei Nicht-Unittests. 
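+
+Zur Veranschaulichung eine vollständige, aber frei erfundene Testfunktion, die einen der oben beschriebenen Skip-Dekoratoren verwendet; Name und Testkörper sind nur eine Skizze:
+
+```python
+import torch
+from transformers.testing_utils import require_torch_multi_gpu
+
+
+@require_torch_multi_gpu
+def test_visible_device_count():
+    # hypothetischer Testkörper, nur zur Illustration:
+    # läuft nur, wenn PyTorch installiert ist und mindestens 2 GPUs sichtbar sind
+    assert torch.cuda.device_count() >= 2
+```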
+ +Innerhalb von Tests: + +- Wie viele GPUs sind verfügbar: + +```python +from transformers.testing_utils import get_gpu_count + +n_gpu = get_gpu_count() # works with torch and tf +``` + +### Testen mit einem bestimmten PyTorch-Backend oder Gerät + +Um die Testsuite auf einem bestimmten Torch-Gerät auszuführen, fügen Sie `TRANSFORMERS_TEST_DEVICE="$Gerät"` hinzu, wobei `$Gerät` das Ziel-Backend ist. Zum Beispiel, um nur auf der CPU zu testen: +```bash +TRANSFORMERS_TEST_DEVICE="cpu" pytest tests/utils/test_logging.py +``` + +Diese Variable ist nützlich, um benutzerdefinierte oder weniger verbreitete PyTorch-Backends wie `mps` zu testen. Sie kann auch verwendet werden, um den gleichen Effekt wie `CUDA_VISIBLE_DEVICES` zu erzielen, indem Sie bestimmte GPUs anvisieren oder im reinen CPU-Modus testen. + +Bestimmte Geräte erfordern einen zusätzlichen Import, nachdem Sie `torch` zum ersten Mal importiert haben. Dies kann über die Umgebungsvariable `TRANSFORMERS_TEST_BACKEND` festgelegt werden: +```bash +TRANSFORMERS_TEST_BACKEND="torch_npu" pytest tests/utils/test_logging.py +``` + + +### Verteiltes Training + +`pytest` kann nicht direkt mit verteiltem Training umgehen. Wenn dies versucht wird, tun die Unterprozesse nicht das Richtige +und denken am Ende, sie seien `pytest` und beginnen, die Testsuite in Schleifen auszuführen. Es funktioniert jedoch, wenn man +einen normalen Prozess erzeugt, der dann mehrere Worker erzeugt und die IO-Pipes verwaltet. + +Hier sind einige Tests, die dies verwenden: + +- [test_trainer_distributed.py](https://github.com/huggingface/transformers/tree/main/tests/trainer/test_trainer_distributed.py) +- [test_deepspeed.py](https://github.com/huggingface/transformers/tree/main/tests/deepspeed/test_deepspeed.py) + +Um direkt mit der Ausführung zu beginnen, suchen Sie in diesen Tests nach dem Aufruf `execute_subprocess_async`. + +Sie benötigen mindestens 2 GPUs, um diese Tests in Aktion zu sehen: + +```bash +CUDA_VISIBLE_DEVICES=0,1 RUN_SLOW=1 pytest -sv tests/test_trainer_distributed.py +``` + +### Erfassung von Ausgaben + +Während der Testausführung werden alle Ausgaben, die an `stdout` und `stderr` gesendet werden, aufgezeichnet. Wenn ein Test oder eine Setup-Methode fehlschlägt, wird die +wird die entsprechende aufgezeichnete Ausgabe in der Regel zusammen mit dem Fehler-Traceback angezeigt. + +Um die Aufzeichnung von Ausgaben zu deaktivieren und `stdout` und `stderr` normal zu erhalten, verwenden Sie `-s` oder `--capture=no`: + +```bash +pytest -s tests/utils/test_logging.py +``` + +So senden Sie Testergebnisse an die JUnit-Formatausgabe: + +```bash +py.test tests --junitxml=result.xml +``` + +### Farbsteuerung + +Keine Farbe zu haben (z.B. gelb auf weißem Hintergrund ist nicht lesbar): + +```bash +pytest --color=no tests/utils/test_logging.py +``` + +### Testbericht an den Online-Dienst pastebin senden + +Erstellen Sie eine URL für jeden Testfehler: + +```bash +pytest --pastebin=failed tests/utils/test_logging.py +``` + +Dadurch werden Informationen über den Testlauf an einen entfernten Paste-Dienst übermittelt und eine URL für jeden Fehlschlag bereitgestellt. Sie können die +Tests wie gewohnt auswählen oder z.B. -x hinzufügen, wenn Sie nur einen bestimmten Fehler senden möchten. 
+ +Erstellen einer URL für ein ganzes Testsitzungsprotokoll: + +```bash +pytest --pastebin=all tests/utils/test_logging.py +``` + +## Tests schreiben + +🤗 Die Tests von Transformers basieren auf `unittest`, werden aber von `pytest` ausgeführt, so dass die meiste Zeit Funktionen aus beiden Systemen +verwendet werden können. + +Sie können [hier](https://docs.pytest.org/en/stable/unittest.html) nachlesen, welche Funktionen unterstützt werden, aber das Wichtigste ist +Wichtig ist, dass die meisten `pytest`-Fixtures nicht funktionieren. Auch die Parametrisierung nicht, aber wir verwenden das Modul +`parametrisiert`, das auf ähnliche Weise funktioniert. + + +### Parametrisierung + +Oft besteht die Notwendigkeit, denselben Test mehrmals auszuführen, aber mit unterschiedlichen Argumenten. Das könnte innerhalb des Tests geschehen +des Tests gemacht werden, aber dann gibt es keine Möglichkeit, den Test mit nur einem Satz von Argumenten auszuführen. + +```python +# test_this1.py +import unittest +from parameterized import parameterized + + +class TestMathUnitTest(unittest.TestCase): + @parameterized.expand( + [ + ("negative", -1.5, -2.0), + ("integer", 1, 1.0), + ("large fraction", 1.6, 1), + ] + ) + def test_floor(self, name, input, expected): + assert_equal(math.floor(input), expected) +``` + +Nun wird dieser Test standardmäßig 3 Mal ausgeführt, wobei jedes Mal die letzten 3 Argumente von `test_floor` den entsprechenden Argumenten in der Parameterliste zugeordnet werden. +die entsprechenden Argumente in der Parameterliste. + +Sie können auch nur die Parameter `negativ` und `ganzzahlig` mit ausführen: + +```bash +pytest -k "negative and integer" tests/test_mytest.py +``` + +oder alle Untertests außer `negativ`, mit: + +```bash +pytest -k "not negative" tests/test_mytest.py +``` + +Neben der Verwendung des gerade erwähnten Filters `-k` können Sie auch den genauen Namen jedes Untertests herausfinden und jeden +oder alle unter Verwendung ihrer genauen Namen ausführen. + +```bash +pytest test_this1.py --collect-only -q +``` + +und es wird aufgelistet: + +```bash +test_this1.py::TestMathUnitTest::test_floor_0_negative +test_this1.py::TestMathUnitTest::test_floor_1_integer +test_this1.py::TestMathUnitTest::test_floor_2_large_fraction +``` + +Jetzt können Sie also nur 2 spezifische Untertests durchführen: + +```bash +pytest test_this1.py::TestMathUnitTest::test_floor_0_negative test_this1.py::TestMathUnitTest::test_floor_1_integer +``` + +Das Modul [parametrisiert](https://pypi.org/project/parameterized/), das sich bereits in den Entwickler-Abhängigkeiten befindet +von `transformers` befindet, funktioniert sowohl für `unittests` als auch für `pytest` Tests. + +Wenn es sich bei dem Test jedoch nicht um einen `Unittest` handelt, können Sie `pytest.mark.parametrize` verwenden (oder Sie können sehen, dass es in +einigen bestehenden Tests verwendet wird, meist unter `Beispiele`). + +Hier ist das gleiche Beispiel, diesmal unter Verwendung der `parametrize`-Markierung von `pytest`: + +```python +# test_this2.py +import pytest + + +@pytest.mark.parametrize( + "name, input, expected", + [ + ("negative", -1.5, -2.0), + ("integer", 1, 1.0), + ("large fraction", 1.6, 1), + ], +) +def test_floor(name, input, expected): + assert_equal(math.floor(input), expected) +``` + +Genau wie bei `parametrisiert` können Sie mit `pytest.mark.parametrize` genau steuern, welche Subtests ausgeführt werden +ausgeführt werden, wenn der Filter `-k` nicht ausreicht. 
Allerdings erzeugt diese Parametrisierungsfunktion einen etwas anderen Satz von +Namen für die Untertests. Sie sehen folgendermaßen aus: + +```bash +pytest test_this2.py --collect-only -q +``` + +und es wird aufgelistet: + +```bash +test_this2.py::test_floor[integer-1-1.0] +test_this2.py::test_floor[negative--1.5--2.0] +test_this2.py::test_floor[large fraction-1.6-1] +``` + +Jetzt können Sie also nur den spezifischen Test durchführen: + +```bash +pytest test_this2.py::test_floor[negative--1.5--2.0] test_this2.py::test_floor[integer-1-1.0] +``` + +wie im vorherigen Beispiel. + + + +### Dateien und Verzeichnisse + +In Tests müssen wir oft wissen, wo sich Dinge relativ zur aktuellen Testdatei befinden, und das ist nicht trivial, da der Test +von mehreren Verzeichnissen aus aufgerufen werden kann oder sich in Unterverzeichnissen mit unterschiedlicher Tiefe befinden kann. Eine Hilfsklasse +`transformers.test_utils.TestCasePlus` löst dieses Problem, indem sie alle grundlegenden Pfade sortiert und einfache +Zugriffsmöglichkeiten auf sie bietet: + +- `pathlib`-Objekte (alle vollständig aufgelöst): + + - `test_file_path` - der aktuelle Testdateipfad, d.h. `__file__` + - `test_file_dir` - das Verzeichnis, das die aktuelle Testdatei enthält + - `tests_dir` - das Verzeichnis der `tests` Testreihe + - `examples_dir` - das Verzeichnis der `examples` Test-Suite + - repo_root_dir` - das Verzeichnis des Repositorys + - src_dir` - das Verzeichnis von `src` (d.h. wo sich das Unterverzeichnis `transformers` befindet) + +- stringifizierte Pfade - wie oben, aber diese geben Pfade als Strings zurück, anstatt als `pathlib`-Objekte: + + - `test_file_path_str` + - `test_file_dir_str` + - `tests_dir_str` + - `examples_dir_str` + - `repo_root_dir_str` + - `src_dir_str` + +Um diese zu verwenden, müssen Sie lediglich sicherstellen, dass der Test in einer Unterklasse von +`transformers.test_utils.TestCasePlus` befindet. Zum Beispiel: + +```python +from transformers.testing_utils import TestCasePlus + + +class PathExampleTest(TestCasePlus): + def test_something_involving_local_locations(self): + data_dir = self.tests_dir / "fixtures/tests_samples/wmt_en_ro" +``` + +Wenn Sie Pfade nicht über `pathlib` manipulieren müssen oder nur einen Pfad als String benötigen, können Sie jederzeit +`str()` auf das `pathlib`-Objekt anwenden oder die Accessoren mit der Endung `_str` verwenden. Zum Beispiel: + +```python +from transformers.testing_utils import TestCasePlus + + +class PathExampleTest(TestCasePlus): + def test_something_involving_stringified_locations(self): + examples_dir = self.examples_dir_str +``` + +### Temporäre Dateien und Verzeichnisse + +Die Verwendung eindeutiger temporärer Dateien und Verzeichnisse ist für die parallele Durchführung von Tests unerlässlich, damit sich die Tests nicht gegenseitig überschreiben. +Daten gegenseitig überschreiben. Außerdem möchten wir, dass die temporären Dateien und Verzeichnisse am Ende jedes Tests, der sie erstellt hat, gelöscht werden. +erstellt hat. Daher ist die Verwendung von Paketen wie `tempfile`, die diese Anforderungen erfüllen, unerlässlich. + +Beim Debuggen von Tests müssen Sie jedoch sehen können, was in der temporären Datei oder dem temporären Verzeichnis gespeichert wird und Sie möchten +Sie müssen den genauen Pfad kennen und dürfen ihn nicht bei jedem neuen Testdurchlauf zufällig ändern. + +Für solche Zwecke ist die Hilfsklasse `transformers.test_utils.TestCasePlus` am besten geeignet. 
Sie ist eine Unterklasse von +Unittest.TestCase`, so dass wir in den Testmodulen einfach von ihr erben können. + +Hier ist ein Beispiel für die Verwendung dieser Klasse: + +```python +from transformers.testing_utils import TestCasePlus + + +class ExamplesTests(TestCasePlus): + def test_whatever(self): + tmp_dir = self.get_auto_remove_tmp_dir() +``` + +Dieser Code erstellt ein eindeutiges temporäres Verzeichnis und setzt `tmp_dir` auf dessen Speicherort. + +- Erstellen Sie ein eindeutiges temporäres Verzeichnis: + +```python +def test_whatever(self): + tmp_dir = self.get_auto_remove_tmp_dir() +``` + +tmp_dir" enthält den Pfad zu dem erstellten temporären Verzeichnis. Es wird am Ende des Tests automatisch entfernt. +Tests entfernt. + +- Erstellen Sie ein temporäres Verzeichnis meiner Wahl, stellen Sie sicher, dass es leer ist, bevor der Test beginnt, und leeren Sie es nach dem Test nicht. + +```python +def test_whatever(self): + tmp_dir = self.get_auto_remove_tmp_dir("./xxx") +``` + +Dies ist nützlich für die Fehlersuche, wenn Sie ein bestimmtes Verzeichnis überwachen und sicherstellen möchten, dass die vorherigen Tests keine Daten darin hinterlassen haben. +keine Daten dort hinterlassen haben. + +- Sie können das Standardverhalten außer Kraft setzen, indem Sie die Argumente `before` und `after` direkt überschreiben, was zu einem der folgenden Verhaltensweisen führt + folgenden Verhaltensweisen: + + - `before=True`: das temporäre Verzeichnis wird immer zu Beginn des Tests gelöscht. + - `before=False`: wenn das temporäre Verzeichnis bereits existiert, bleiben alle vorhandenen Dateien dort erhalten. + - `after=True`: das temporäre Verzeichnis wird immer am Ende des Tests gelöscht. + - `after=False`: das temporäre Verzeichnis wird am Ende des Tests immer beibehalten. + + + +Um das Äquivalent von `rm -r` sicher ausführen zu können, sind nur Unterverzeichnisse des Projektarchivs checkout erlaubt, wenn +ein explizites `tmp_dir` verwendet wird, so dass nicht versehentlich ein `/tmp` oder ein ähnlich wichtiger Teil des Dateisystems vernichtet wird. +d.h. geben Sie bitte immer Pfade an, die mit `./` beginnen. + + + + + +Jeder Test kann mehrere temporäre Verzeichnisse registrieren, die alle automatisch entfernt werden, sofern nicht anders gewünscht. +anders. + + + +### Temporäre Überschreibung von sys.path + +Wenn Sie `sys.path` vorübergehend überschreiben müssen, um z.B. von einem anderen Test zu importieren, können Sie den +Kontextmanager `ExtendSysPath` verwenden. Beispiel: + + +```python +import os +from transformers.testing_utils import ExtendSysPath + +bindir = os.path.abspath(os.path.dirname(__file__)) +with ExtendSysPath(f"{bindir}/.."): + from test_trainer import TrainerIntegrationCommon # noqa +``` + +### Überspringen von Tests + +Dies ist nützlich, wenn ein Fehler gefunden und ein neuer Test geschrieben wird, der Fehler aber noch nicht behoben ist. Damit wir ihn +in das Haupt-Repository zu übertragen, müssen wir sicherstellen, dass er bei `make test` übersprungen wird. + +Methoden: + +- Ein **Skip** bedeutet, dass Sie erwarten, dass Ihr Test nur dann erfolgreich ist, wenn einige Bedingungen erfüllt sind, andernfalls sollte pytest den Test überspringen. + die Ausführung des Tests ganz überspringen. Übliche Beispiele sind das Überspringen von Tests, die nur unter Windows laufen, auf Nicht-Windows-Plattformen oder das Überspringen von + Tests, die von einer externen Ressource abhängen, die im Moment nicht verfügbar ist (z.B. eine Datenbank). 
+ +- Ein **xfail** bedeutet, dass Sie erwarten, dass ein Test aus irgendeinem Grund fehlschlägt. Ein gängiges Beispiel ist ein Test für eine Funktion, die noch nicht + noch nicht implementiert oder ein noch nicht behobener Fehler. Wenn ein Test trotz eines erwarteten Fehlschlags bestanden wird (markiert mit + pytest.mark.xfail), ist dies ein xpass und wird in der Testzusammenfassung gemeldet. + +Einer der wichtigsten Unterschiede zwischen den beiden ist, dass `skip` den Test nicht ausführt, während `xfail` dies tut. Wenn also der +Code, der fehlerhaft ist, einen schlechten Zustand verursacht, der sich auf andere Tests auswirkt, sollten Sie also nicht `xfail` verwenden. + +#### Implementierung + +- Hier sehen Sie, wie Sie einen ganzen Test bedingungslos überspringen können: + +```python no-style +@unittest.skip("this bug needs to be fixed") +def test_feature_x(): +``` + +oder mit pytest: + +```python no-style +@pytest.mark.skip(reason="this bug needs to be fixed") +``` + +oder mit dem `xfail` Weg: + +```python no-style +@pytest.mark.xfail +def test_feature_x(): +``` + +- Hier erfahren Sie, wie Sie einen Test aufgrund einer internen Prüfung innerhalb des Tests auslassen können: + +```python +def test_feature_x(): + if not has_something(): + pytest.skip("unsupported configuration") +``` + +oder das ganze Modul: + +```python +import pytest + +if not pytest.config.getoption("--custom-flag"): + pytest.skip("--custom-flag is missing, skipping tests", allow_module_level=True) +``` + +oder mit dem `xfail` Weg: + +```python +def test_feature_x(): + pytest.xfail("expected to fail until bug XYZ is fixed") +``` + +- Hier erfahren Sie, wie Sie alle Tests in einem Modul überspringen können, wenn ein Import fehlt: + +```python +docutils = pytest.importorskip("docutils", minversion="0.3") +``` + +- Einen Test aufgrund einer Bedingung überspringen: + +```python no-style +@pytest.mark.skipif(sys.version_info < (3,6), reason="requires python3.6 or higher") +def test_feature_x(): +``` + +oder: + +```python no-style +@unittest.skipIf(torch_device == "cpu", "Can't do half precision") +def test_feature_x(): +``` + +oder überspringen Sie das ganze Modul: + +```python no-style +@pytest.mark.skipif(sys.platform == 'win32', reason="does not run on windows") +class TestClass(): + def test_feature_x(self): +``` + +Weitere Details, Beispiele und Möglichkeiten finden Sie [hier](https://docs.pytest.org/en/latest/skipping.html). + +### Langsame Tests + +Die Bibliothek der Tests wächst ständig, und einige der Tests brauchen Minuten, um ausgeführt zu werden, daher können wir es uns nicht leisten, eine Stunde zu warten, bis die +eine Stunde auf die Fertigstellung der Testsuite auf CI zu warten. Daher sollten langsame Tests, mit einigen Ausnahmen für wichtige Tests, wie im folgenden Beispiel +wie im folgenden Beispiel markiert werden: + +```python no-style +from transformers.testing_utils import slow +@slow +def test_integration_foo(): +``` + +Sobald ein Test als `@langsam` markiert ist, setzen Sie die Umgebungsvariable `RUN_SLOW=1`, um solche Tests auszuführen, z.B: + +```bash +RUN_SLOW=1 pytest tests +``` + +Einige Dekoratoren wie `@parameterized` schreiben Testnamen um, daher müssen `@slow` und die übrigen Skip-Dekoratoren +`@require_*` müssen als letztes aufgeführt werden, damit sie korrekt funktionieren. Hier ist ein Beispiel für die korrekte Verwendung: + +```python no-style +@parameteriz ed.expand(...) 
+@slow +def test_integration_foo(): +``` + +Wie zu Beginn dieses Dokuments erläutert, werden langsame Tests nach einem Zeitplan ausgeführt und nicht in PRs CI +Prüfungen. Es ist also möglich, dass einige Probleme bei der Einreichung eines PRs übersehen werden und zusammengeführt werden. Solche Probleme werden +werden beim nächsten geplanten CI-Job abgefangen. Das bedeutet aber auch, dass es wichtig ist, die langsamen Tests auf Ihrem +Rechner auszuführen, bevor Sie den PR einreichen. + +Hier ist ein grober Entscheidungsmechanismus für die Auswahl der Tests, die als langsam markiert werden sollen: + +Wenn der Test auf eine der internen Komponenten der Bibliothek ausgerichtet ist (z.B. Modellierungsdateien, Tokenisierungsdateien, +Pipelines), dann sollten wir diesen Test in der nicht langsamen Testsuite ausführen. Wenn er sich auf einen anderen Aspekt der Bibliothek bezieht, +wie z.B. die Dokumentation oder die Beispiele, dann sollten wir diese Tests in der langsamen Testsuite durchführen. Und dann, zur Verfeinerung +Ansatz zu verfeinern, sollten wir Ausnahmen einführen: + +- Alle Tests, die einen umfangreichen Satz von Gewichten oder einen Datensatz mit einer Größe von mehr als ~50MB herunterladen müssen (z.B. Modell- oder + Tokenizer-Integrationstests, Pipeline-Integrationstests) sollten auf langsam gesetzt werden. Wenn Sie ein neues Modell hinzufügen, sollten Sie + sollten Sie eine kleine Version des Modells (mit zufälligen Gewichtungen) für Integrationstests erstellen und in den Hub hochladen. Dies wird + wird in den folgenden Abschnitten erläutert. +- Alle Tests, die ein Training durchführen müssen, das nicht speziell auf Schnelligkeit optimiert ist, sollten auf langsam gesetzt werden. +- Wir können Ausnahmen einführen, wenn einige dieser Tests, die nicht langsam sein sollten, unerträglich langsam sind, und sie auf + @langsam`. Auto-Modellierungstests, die große Dateien auf der Festplatte speichern und laden, sind ein gutes Beispiel für Tests, die als + als `@langsam` markiert sind. +- Wenn ein Test in weniger als 1 Sekunde auf CI abgeschlossen wird (einschließlich eventueller Downloads), sollte es sich trotzdem um einen normalen Test handeln. + +Insgesamt müssen alle nicht langsamen Tests die verschiedenen Interna abdecken und dabei schnell bleiben. Zum Beispiel, +kann eine signifikante Abdeckung erreicht werden, indem Sie mit speziell erstellten kleinen Modellen mit zufälligen Gewichten testen. Solche Modelle +haben eine sehr geringe Anzahl von Schichten (z.B. 2), Vokabeln (z.B. 1000), usw. Dann können die `@slow`-Tests große +langsame Modelle verwenden, um qualitative Tests durchzuführen. Um die Verwendung dieser Modelle zu sehen, suchen Sie einfach nach *winzigen* Modellen mit: + +```bash +grep tiny tests examples +``` + +Hier ist ein Beispiel für ein [Skript](https://github.com/huggingface/transformers/tree/main/scripts/fsmt/fsmt-make-tiny-model.py), das das winzige Modell erstellt hat +[stas/tiny-wmt19-en-de](https://huggingface.co/stas/tiny-wmt19-en-de). Sie können es ganz einfach an Ihre eigene +Architektur Ihres Modells anpassen. + +Es ist leicht, die Laufzeit falsch zu messen, wenn zum Beispiel ein großes Modell heruntergeladen wird, aber wenn +Sie es lokal testen, würden die heruntergeladenen Dateien zwischengespeichert und somit die Download-Zeit nicht gemessen werden. Prüfen Sie daher den +Ausführungsgeschwindigkeitsbericht in den CI-Protokollen (die Ausgabe von `pytest --durations=0 tests`). 
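+
+Denselben Bericht können Sie auch lokal für eine beliebige Teilmenge der Tests erzeugen; mit z.B. `--durations=10` werden nur die 10 langsamsten Tests aufgelistet (der Pfad dient hier nur als Beispiel):
+
+```bash
+pytest --durations=10 tests/utils/test_logging.py
+```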
+ +Dieser Bericht ist auch nützlich, um langsame Ausreißer zu finden, die nicht als solche gekennzeichnet sind oder die neu geschrieben werden müssen, um schnell zu sein. +Wenn Sie bemerken, dass die Testsuite beim CI langsam wird, zeigt die oberste Liste dieses Berichts die langsamsten +Tests. + + +### Testen der stdout/stderr-Ausgabe + +Um Funktionen zu testen, die in `stdout` und/oder `stderr` schreiben, kann der Test auf diese Ströme zugreifen, indem er die +[capsys system](https://docs.pytest.org/en/latest/capture.html) von `pytest` zugreifen. So wird dies bewerkstelligt: + +```python +import sys + + +def print_to_stdout(s): + print(s) + + +def print_to_stderr(s): + sys.stderr.write(s) + + +def test_result_and_stdout(capsys): + msg = "Hello" + print_to_stdout(msg) + print_to_stderr(msg) + out, err = capsys.readouterr() # consume the captured output streams + # optional: if you want to replay the consumed streams: + sys.stdout.write(out) + sys.stderr.write(err) + # test: + assert msg in out + assert msg in err +``` + +Und natürlich wird `stderr` in den meisten Fällen als Teil einer Ausnahme auftreten, so dass try/except in einem solchen Fall verwendet werden muss +Fall verwendet werden: + +```python +def raise_exception(msg): + raise ValueError(msg) + + +def test_something_exception(): + msg = "Not a good value" + error = "" + try: + raise_exception(msg) + except Exception as e: + error = str(e) + assert msg in error, f"{msg} is in the exception:\n{error}" +``` + +Ein anderer Ansatz zur Erfassung von stdout ist `contextlib.redirect_stdout`: + +```python +from io import StringIO +from contextlib import redirect_stdout + + +def print_to_stdout(s): + print(s) + + +def test_result_and_stdout(): + msg = "Hello" + buffer = StringIO() + with redirect_stdout(buffer): + print_to_stdout(msg) + out = buffer.getvalue() + # optional: if you want to replay the consumed streams: + sys.stdout.write(out) + # test: + assert msg in out +``` + +Ein wichtiges potenzielles Problem beim Erfassen von stdout ist, dass es `r` Zeichen enthalten kann, die bei normalem `print` +alles zurücksetzen, was bisher gedruckt wurde. Mit `pytest` gibt es kein Problem, aber mit `pytest -s` werden diese +werden diese Zeichen in den Puffer aufgenommen. Um den Test mit und ohne `-s` laufen zu lassen, müssen Sie also eine zusätzliche Bereinigung +zusätzliche Bereinigung der erfassten Ausgabe vornehmen, indem Sie `re.sub(r'~.*\r', '', buf, 0, re.M)` verwenden. 
+
+Aber dann haben wir einen Hilfskontextmanager-Wrapper, der sich automatisch um alles kümmert, unabhängig davon, ob die Ausgabe
+einige `\r`-Zeichen enthält oder nicht:
+
+```python
+from transformers.testing_utils import CaptureStdout
+
+with CaptureStdout() as cs:
+    function_that_writes_to_stdout()
+print(cs.out)
+```
+
+Hier ist ein vollständiges Testbeispiel:
+
+```python
+from transformers.testing_utils import CaptureStdout
+
+msg = "Secret message\r"
+final = "Hello World"
+with CaptureStdout() as cs:
+    print(msg + final)
+assert cs.out == final + "\n", f"captured: {cs.out}, expecting {final}"
+```
+
+Wenn Sie `stderr` aufzeichnen möchten, verwenden Sie stattdessen die Klasse `CaptureStderr`:
+
+```python
+from transformers.testing_utils import CaptureStderr
+
+with CaptureStderr() as cs:
+    function_that_writes_to_stderr()
+print(cs.err)
+```
+
+Wenn Sie beide Streams auf einmal erfassen müssen, verwenden Sie die übergeordnete Klasse `CaptureStd`:
+
+```python
+from transformers.testing_utils import CaptureStd
+
+with CaptureStd() as cs:
+    function_that_writes_to_stdout_and_stderr()
+print(cs.err, cs.out)
+```
+
+Um das Debuggen von Testproblemen zu erleichtern, geben diese Kontextmanager die aufgezeichneten Streams beim Verlassen
+des Kontexts standardmäßig wieder aus.
+
+
+### Erfassen von Logger-Streams
+
+Wenn Sie die Ausgabe eines Loggers validieren müssen, können Sie `CaptureLogger` verwenden:
+
+```python
+from transformers import logging
+from transformers.testing_utils import CaptureLogger
+
+msg = "Testing 1, 2, 3"
+logging.set_verbosity_info()
+logger = logging.get_logger("transformers.models.bart.tokenization_bart")
+with CaptureLogger(logger) as cl:
+    logger.info(msg)
+assert cl.out == msg + "\n"
+```
+
+### Testen mit Umgebungsvariablen
+
+Wenn Sie die Auswirkungen von Umgebungsvariablen für einen bestimmten Test prüfen möchten, können Sie den Hilfsdekorator
+`transformers.testing_utils.mockenv` verwenden:
+
+```python
+import os
+import unittest
+
+from transformers.testing_utils import mockenv
+
+
+class HfArgumentParserTest(unittest.TestCase):
+    @mockenv(TRANSFORMERS_VERBOSITY="error")
+    def test_env_override(self):
+        env_level_str = os.getenv("TRANSFORMERS_VERBOSITY", None)
+```
+
+Manchmal muss ein externes Programm aufgerufen werden, was die Einstellung von `PYTHONPATH` in `os.environ` erfordert, um
+mehrere lokale Pfade einzuschließen. Die Hilfsklasse `transformers.testing_utils.TestCasePlus` hilft Ihnen dabei:
+
+```python
+from transformers.testing_utils import TestCasePlus
+
+
+class EnvExampleTest(TestCasePlus):
+    def test_external_prog(self):
+        env = self.get_env()
+        # now call the external program, passing `env` to it
+```
+
+Je nachdem, ob sich die Testdatei in der Testsuite `tests` oder in `examples` befindet, richtet `get_env()` den Eintrag
+`env[PYTHONPATH]` korrekt so ein, dass er eines dieser beiden Verzeichnisse enthält sowie das Verzeichnis `src`,
+um sicherzustellen, dass der Test gegen das aktuelle Projektarchiv durchgeführt wird, und schließlich das, was in
+`env[PYTHONPATH]` bereits eingestellt war, bevor der Test aufgerufen wurde (sofern vorhanden).
+
+Diese Hilfsmethode erstellt eine Kopie des Objekts `os.environ`, so dass das Original intakt bleibt.
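+
+Als kleine, rein illustrative Skizze dazu, wie der Aufruf eines externen Programms mit dieser Umgebung aussehen könnte (Klassenname und Skriptpfad sind hier nur Platzhalter, kein Bestandteil der Bibliothek):
+
+```python
+import subprocess
+import sys
+
+from transformers.testing_utils import TestCasePlus
+
+
+class RunExternalProgTest(TestCasePlus):
+    def test_external_prog(self):
+        env = self.get_env()  # copy of os.environ with PYTHONPATH prepared for the repo
+        # hypothetical script path, only for illustration
+        cmd = [sys.executable, "examples/pytorch/text-classification/run_glue.py", "--help"]
+        result = subprocess.run(cmd, env=env, capture_output=True, text=True)
+        assert result.returncode == 0, result.stderr
+```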
+
+### Reproduzierbare Ergebnisse erhalten
+
+In manchen Situationen möchten Sie vielleicht die Zufälligkeit Ihrer Tests beseitigen. Um identische, reproduzierbare
+Ergebnisse zu erhalten, müssen Sie den Seed festlegen:
+
+```python
+seed = 42
+
+# python RNG
+import random
+
+random.seed(seed)
+
+# pytorch RNGs
+import torch
+
+torch.manual_seed(seed)
+torch.backends.cudnn.deterministic = True
+if torch.cuda.is_available():
+    torch.cuda.manual_seed_all(seed)
+
+# numpy RNG
+import numpy as np
+
+np.random.seed(seed)
+
+# tf RNG
+import tensorflow as tf
+
+tf.random.set_seed(seed)
+```
+
+### Tests debuggen
+
+Um einen Debugger an der Stelle zu starten, an der die Warnung auftritt, gehen Sie wie folgt vor:
+
+```bash
+pytest tests/utils/test_logging.py -W error::UserWarning --pdb
+```
+
+## Arbeiten mit GitHub-Actions-Workflows
+
+Um einen CI-Job für einen Self-Push-Workflow auszulösen, müssen Sie:
+
+1. Erstellen Sie einen neuen Zweig im `transformers`-Ursprungsrepository (kein Fork!).
+2. Der Name des Zweigs muss entweder mit `ci_` oder `ci-` beginnen (`main` löst ihn ebenfalls aus, aber wir können keine PRs auf
+   `main` erstellen). Er wird außerdem nur für bestimmte Pfade ausgelöst - die aktuelle Definition finden Sie, falls sie
+   sich seit der Erstellung dieses Dokuments geändert hat, [hier](https://github.com/huggingface/transformers/blob/main/.github/workflows/self-push.yml) unter *push:*
+3. Erstellen Sie einen PR von diesem Zweig.
+4. Dann können Sie [hier](https://github.com/huggingface/transformers/actions/workflows/self-push.yml) sehen, wie der Job erscheint. Er wird möglicherweise nicht sofort ausgeführt, wenn
+   ein Backlog vorhanden ist.
+
+
+
+
+## Testen experimenteller CI-Funktionen
+
+Das Testen von CI-Funktionen kann potenziell problematisch sein, da es den normalen CI-Betrieb beeinträchtigen kann. Wenn also eine
+neue CI-Funktion hinzugefügt werden soll, sollte dies wie folgt geschehen:
+
+1. Erstellen Sie einen neuen Job, der die zu testende Funktion testet.
+2. Der neue Job muss immer erfolgreich sein, so dass er uns ein grünes ✓ gibt (Details unten).
+3. Lassen Sie ihn einige Tage lang laufen, um zu sehen, dass eine Vielzahl verschiedener PR-Typen darauf läuft (Benutzer-Fork-Zweige,
+   nicht geforkte Zweige, Zweige aus direkten Dateibearbeitungen über die github.com-UI, verschiedene erzwungene Pushes usw. - es gibt
+   sehr viele), während Sie die Protokolle des experimentellen Jobs überwachen (nicht den Status des gesamten Jobs, da er absichtlich immer
+   grün ist).
+4. Wenn klar ist, dass alles in Ordnung ist, fügen Sie die neuen Änderungen in die bestehenden Jobs ein.
+
+Auf diese Weise wird der normale Arbeitsablauf nicht durch Experimente mit der CI-Funktionalität selbst beeinträchtigt.
+
+Wie können wir nun dafür sorgen, dass der Job immer erfolgreich ist, während die neue CI-Funktion entwickelt wird?
+
+Einige CIs, wie TravisCI, unterstützen ignore-step-failure und melden den gesamten Job als erfolgreich, aber CircleCI und
+GitHub Actions unterstützen dies zum jetzigen Zeitpunkt nicht.
+
+Sie können also die folgende Abhilfe verwenden:
+
+1. Setzen Sie `set +euo pipefail` an den Anfang des Ausführungsbefehls, um die meisten potenziellen Fehler im Bash-Skript zu unterdrücken.
+2. Der letzte Befehl muss ein Erfolg sein: `echo "done"` oder einfach `true` reicht aus.
+ +Hier ist ein Beispiel: + +```yaml +- run: + name: run CI experiment + command: | + set +euo pipefail + echo "setting run-all-despite-any-errors-mode" + this_command_will_fail + echo "but bash continues to run" + # emulate another failure + false + # but the last command must be a success + echo "during experiment do not remove: reporting success to CI, even if there were failures" +``` + +Für einfache Befehle können Sie auch Folgendes tun: + +```bash +cmd_that_may_fail || true +``` + +Wenn Sie mit den Ergebnissen zufrieden sind, integrieren Sie den experimentellen Schritt oder Job natürlich in den Rest der normalen Jobs, +Entfernen Sie dabei `set +euo pipefail` oder andere Dinge, die Sie eventuell hinzugefügt haben, um sicherzustellen, dass der experimentelle Auftrag nicht +den normalen CI-Betrieb nicht beeinträchtigt. + +Dieser ganze Prozess wäre viel einfacher gewesen, wenn wir nur etwas wie `allow-failure` für den +experimentellen Schritt festlegen könnten und ihn scheitern lassen würden, ohne den Gesamtstatus der PRs zu beeinträchtigen. Aber wie bereits erwähnt, haben CircleCI und +Github Actions dies im Moment nicht unterstützen. + +Sie können in diesen CI-spezifischen Threads für diese Funktion stimmen und sehen, wo sie steht: + +- [Github Actions:](https://github.com/actions/toolkit/issues/399) +- [CircleCI:](https://ideas.circleci.com/ideas/CCI-I-344) diff --git a/docs/source/de/training.md b/docs/source/de/training.md new file mode 100644 index 000000000000..493de3052bbf --- /dev/null +++ b/docs/source/de/training.md @@ -0,0 +1,433 @@ + + +# Optimierung eines vortrainierten Modells + +[[open-in-colab]] + +Die Verwendung eines vorab trainierten Modells hat erhebliche Vorteile. Es reduziert die Rechenkosten und den CO2-Fußabdruck und ermöglicht Ihnen die Verwendung von Modellen, die dem neuesten Stand der Technik entsprechen, ohne dass Sie ein Modell von Grund auf neu trainieren müssen. Transformers bietet Zugang zu Tausenden von vortrainierten Modellen für eine Vielzahl von Aufgaben. Wenn Sie ein vorab trainiertes Modell verwenden, trainieren Sie es auf einem für Ihre Aufgabe spezifischen Datensatz. Dies wird als Feinabstimmung bezeichnet und ist eine unglaublich leistungsfähige Trainingstechnik. In diesem Tutorial werden Sie ein vortrainiertes Modell mit einem Deep-Learning-Framework Ihrer Wahl feinabstimmen: + +* Feinabstimmung eines vorab trainierten Modells mit 🤗 Transformers [`Trainer`]. +* Feinabstimmung eines vorab trainierten Modells in TensorFlow mit Keras. +* Feinabstimmung eines vorab trainierten Modells in nativem PyTorch. + + + +## Vorbereitung eines Datensatzes + + + +Bevor Sie die Feinabstimmung eines vortrainierten Modells vornehmen können, müssen Sie einen Datensatz herunterladen und für das Training vorbereiten. Im vorangegangenen Leitfaden haben Sie gelernt, wie man Daten für das Training aufbereitet, und jetzt haben Sie die Gelegenheit, diese Fähigkeiten zu testen! + +Laden Sie zunächst den Datensatz [Yelp Reviews](https://huggingface.co/datasets/yelp_review_full): + +```py +>>> from datasets import load_dataset + +>>> dataset = load_dataset("yelp_review_full") +>>> dataset["train"][100] +{'label': 0, + 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. 
I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'} +``` + +Wie Sie nun wissen, benötigen Sie einen Tokenizer, um den Text zu verarbeiten und eine Auffüll- und Abschneidungsstrategie einzubauen, um mit variablen Sequenzlängen umzugehen. Um Ihren Datensatz in einem Schritt zu verarbeiten, verwenden Sie die 🤗 Methode Datasets [`map`](https://huggingface.co/docs/datasets/process.html#map), um eine Vorverarbeitungsfunktion auf den gesamten Datensatz anzuwenden: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + + +>>> def tokenize_function(examples): +... return tokenizer(examples["text"], padding="max_length", truncation=True) + + +>>> tokenized_datasets = dataset.map(tokenize_function, batched=True) +``` + +Wenn Sie möchten, können Sie eine kleinere Teilmenge des gesamten Datensatzes für die Feinabstimmung erstellen, um den Zeitaufwand zu verringern: + +```py +>>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) +>>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) +``` + + + +## Training + +An dieser Stelle sollten Sie dem Abschnitt folgen, der dem Rahmen entspricht, den Sie verwenden möchten. Sie können über die Links +in der rechten Seitenleiste können Sie zu dem gewünschten Abschnitt springen - und wenn Sie den gesamten Inhalt eines bestimmten Frameworks ausblenden möchten, +klicken Sie einfach auf die Schaltfläche oben rechts im Block des jeweiligen Frameworks! + + + + + +## Trainieren mit PyTorch Trainer + +🤗 Transformers bietet eine [`Trainer`]-Klasse, die für das Training von 🤗 Transformers-Modellen optimiert ist und es einfacher macht, mit dem Training zu beginnen, ohne manuell eine eigene Trainingsschleife zu schreiben. Die [`Trainer`]-API unterstützt eine breite Palette von Trainingsoptionen und Funktionen wie Logging, Gradientenakkumulation und gemischte Präzision. + +Beginnen Sie mit dem Laden Ihres Modells und geben Sie die Anzahl der erwarteten Labels an. Aus dem Yelp Review [dataset card](https://huggingface.co/datasets/yelp_review_full#data-fields) wissen Sie, dass es fünf Labels gibt: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) +``` + + + +Es wird eine Warnung angezeigt, dass einige der trainierten Parameter nicht verwendet werden und einige Parameter zufällig +initialisiert werden. Machen Sie sich keine Sorgen, das ist völlig normal! 
Der vorher trainierte Kopf des BERT-Modells wird verworfen und durch einen zufällig initialisierten Klassifikationskopf ersetzt. Sie werden diesen neuen Modellkopf in Ihrer Sequenzklassifizierungsaufgabe feinabstimmen, indem Sie das Wissen des vortrainierten Modells auf ihn übertragen. + + + +### Hyperparameter für das Training + +Als Nächstes erstellen Sie eine Klasse [`TrainingArguments`], die alle Hyperparameter enthält, die Sie einstellen können, sowie Flags zur Aktivierung verschiedener Trainingsoptionen. Für dieses Lernprogramm können Sie mit den Standard- [Hyperparametern](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments) beginnen, aber Sie können mit diesen experimentieren, um Ihre optimalen Einstellungen zu finden. + +Geben Sie an, wo die Kontrollpunkte Ihres Trainings gespeichert werden sollen: + +```py +>>> from transformers import TrainingArguments + +>>> training_args = TrainingArguments(output_dir="test_trainer") +``` + +### Auswerten + +Der [`Trainer`] wertet die Leistung des Modells während des Trainings nicht automatisch aus. Sie müssen [`Trainer`] eine Funktion übergeben, um Metriken zu berechnen und zu berichten. Die [🤗 Evaluate](https://huggingface.co/docs/evaluate/index) Bibliothek bietet eine einfache [`accuracy`](https://huggingface.co/spaces/evaluate-metric/accuracy) Funktion, die Sie mit der [`evaluate.load`] Funktion laden können (siehe diese [quicktour](https://huggingface.co/docs/evaluate/a_quick_tour) für weitere Informationen): + +```py +>>> import numpy as np +>>> import evaluate + +>>> metric = evaluate.load("accuracy") +``` + +Rufen Sie [`~evaluate.compute`] auf `metric` auf, um die Genauigkeit Ihrer Vorhersagen zu berechnen. Bevor Sie Ihre Vorhersagen an `compute` übergeben, müssen Sie die Vorhersagen in Logits umwandeln (denken Sie daran, dass alle 🤗 Transformers-Modelle Logits zurückgeben): + +```py +>>> def compute_metrics(eval_pred): +... logits, labels = eval_pred +... predictions = np.argmax(logits, axis=-1) +... return metric.compute(predictions=predictions, references=labels) +``` + +Wenn Sie Ihre Bewertungsmetriken während der Feinabstimmung überwachen möchten, geben Sie den Parameter `evaluation_strategy` in Ihren Trainingsargumenten an, um die Bewertungsmetrik am Ende jeder Epoche zu ermitteln: + +```py +>>> from transformers import TrainingArguments, Trainer + +>>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") +``` + +### Trainer + +Erstellen Sie ein [`Trainer`]-Objekt mit Ihrem Modell, Trainingsargumenten, Trainings- und Testdatensätzen und einer Evaluierungsfunktion: + +```py +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=small_train_dataset, +... eval_dataset=small_eval_dataset, +... compute_metrics=compute_metrics, +... ) +``` + +Anschließend können Sie Ihr Modell durch den Aufruf von [`~transformers.Trainer.train`] optimieren: + +```py +>>> trainer.train() +``` + + + + + + +## Trainieren Sie ein TensorFlow-Modell mit Keras + +Sie können auch 🤗 Transformers Modelle in TensorFlow mit der Keras API trainieren! + +### Laden von Daten für Keras + +Wenn Sie ein 🤗 Transformers Modell mit der Keras API trainieren wollen, müssen Sie Ihren Datensatz in ein Format konvertieren, das +Keras versteht. Wenn Ihr Datensatz klein ist, können Sie das Ganze einfach in NumPy-Arrays konvertieren und an Keras übergeben. +Probieren wir das zuerst aus, bevor wir etwas Komplizierteres tun. + +Laden Sie zunächst ein Dataset. 
Wir werden den CoLA-Datensatz aus dem [GLUE-Benchmark](https://huggingface.co/datasets/glue) verwenden,
+da es sich um eine einfache Aufgabe zur binären Textklassifizierung handelt, und nehmen vorerst nur den Trainingssplit.
+
+```py
+from datasets import load_dataset
+
+dataset = load_dataset("glue", "cola")
+dataset = dataset["train"]  # Just take the training split for now
+```
+
+Als Nächstes laden Sie einen Tokenizer und tokenisieren die Daten als NumPy-Arrays. Beachten Sie, dass die Labels bereits eine Liste von 0en und 1en sind;
+wir können sie also ohne Tokenisierung direkt in ein NumPy-Array konvertieren!
+
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+tokenized_data = tokenizer(dataset["text"], return_tensors="np", padding=True)
+# Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras
+tokenized_data = dict(tokenized_data)
+
+labels = np.array(dataset["label"])  # Label is already an array of 0 and 1
+```
+
+Schließlich laden, [`compile`](https://keras.io/api/models/model_training_apis/#compile-method) und [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) Sie das Modell:
+
+```py
+from transformers import TFAutoModelForSequenceClassification
+from tensorflow.keras.optimizers import Adam
+
+# Load and compile our model
+model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
+# Lower learning rates are often better for fine-tuning transformers
+model.compile(optimizer=Adam(3e-5))
+
+model.fit(tokenized_data, labels)
+```
+
+
+
+Sie müssen Ihren Modellen kein Verlustargument übergeben, wenn Sie sie mit `compile()` kompilieren! Hugging-Face-Modelle wählen automatisch
+einen Loss, der für ihre Aufgabe und Modellarchitektur geeignet ist, wenn dieses Argument leer gelassen wird. Sie können diese Wahl jederzeit außer Kraft setzen, indem Sie selbst einen Loss angeben, wenn Sie das möchten!
+
+
+
+Dieser Ansatz eignet sich hervorragend für kleinere Datensätze, aber bei größeren Datensätzen kann er zu einem Problem werden. Warum?
+Weil das tokenisierte Array und die Labels vollständig in den Speicher geladen werden müssten und weil NumPy nicht mit
+"gezackten" Arrays umgehen kann, so dass jedes tokenisierte Sample auf die Länge des längsten Samples im gesamten
+Datensatz aufgefüllt werden müsste. Dadurch wird das Array noch größer, und all die aufgefüllten Token verlangsamen auch das Training!
+
+### Laden von Daten als tf.data.Dataset
+
+Wenn Sie eine Verlangsamung des Trainings vermeiden wollen, können Sie Ihre Daten stattdessen als `tf.data.Dataset` laden. Sie können zwar Ihre eigene
+`tf.data`-Pipeline schreiben, wenn Sie wollen, wir haben aber zwei bequeme Methoden, um dies zu tun:
+
+- [`~TFPreTrainedModel.prepare_tf_dataset`]: Dies ist die Methode, die wir in den meisten Fällen empfehlen. Da es sich um eine Methode
+Ihres Modells handelt, kann sie das Modell inspizieren, um automatisch herauszufinden, welche Spalten als Modelleingaben verwendet werden können, und
+verwirft die anderen, um einen einfacheren, leistungsfähigeren Datensatz zu erstellen.
+- [`~datasets.Dataset.to_tf_dataset`]: Diese Methode ist eher auf niedriger Ebene angesiedelt und nützlich, wenn Sie genau kontrollieren wollen, wie
+das Dataset erstellt wird, indem Sie genau angeben, welche `columns` und `label_cols` einbezogen werden sollen (eine kleine Skizze dazu folgt direkt unter dieser Liste).
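+
+Als kleine, unverbindliche Skizze für den zweiten, direkteren Weg (die Spaltennamen sind Annahmen und setzen voraus, dass die Tokenizer-Ausgaben bereits als Spalten im Datensatz vorhanden sind, siehe den nächsten Schritt):
+
+```py
+from transformers import DataCollatorWithPadding
+
+# Pads each batch on the fly instead of padding the whole dataset up front
+collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
+
+tf_dataset = dataset.to_tf_dataset(
+    columns=["input_ids", "token_type_ids", "attention_mask"],  # assumed tokenizer output columns
+    label_cols=["label"],
+    batch_size=16,
+    shuffle=True,
+    collate_fn=collator,
+)
+```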
+ +Bevor Sie [~TFPreTrainedModel.prepare_tf_dataset`] verwenden können, müssen Sie die Tokenizer-Ausgaben als Spalten zu Ihrem Datensatz hinzufügen, wie in +dem folgenden Codebeispiel: + +```py +def tokenize_dataset(data): + # Keys of the returned dictionary will be added to the dataset as columns + return tokenizer(data["text"]) + + +dataset = dataset.map(tokenize_dataset) +``` + +Denken Sie daran, dass Hugging Face-Datensätze standardmäßig auf der Festplatte gespeichert werden, so dass dies nicht zu einem erhöhten Arbeitsspeicherbedarf führen wird! Sobald die +Spalten hinzugefügt wurden, können Sie Batches aus dem Datensatz streamen und zu jedem Batch Auffüllungen hinzufügen, was die Anzahl der Auffüllungs-Token im Vergleich zum Auffüllen des gesamten Datensatzes reduziert. + + +```py +>>> tf_dataset = model.prepare_tf_dataset(dataset, batch_size=16, shuffle=True, tokenizer=tokenizer) +``` + +Beachten Sie, dass Sie im obigen Codebeispiel den Tokenizer an `prepare_tf_dataset` übergeben müssen, damit die Stapel beim Laden korrekt aufgefüllt werden können. +Wenn alle Stichproben in Ihrem Datensatz die gleiche Länge haben und kein Auffüllen erforderlich ist, können Sie dieses Argument weglassen. +Wenn Sie etwas Komplexeres als nur das Auffüllen von Stichproben benötigen (z. B. das Korrumpieren von Token für die maskierte Sprachmodellierung), können Sie das Argument +Modellierung), können Sie stattdessen das Argument `collate_fn` verwenden, um eine Funktion zu übergeben, die aufgerufen wird, um die +Liste von Stichproben in einen Stapel umwandelt und alle gewünschten Vorverarbeitungen vornimmt. Siehe unsere +[examples](https://github.com/huggingface/transformers/tree/main/examples) oder +[notebooks](https://huggingface.co/docs/transformers/notebooks), um diesen Ansatz in Aktion zu sehen. + +Sobald Sie einen `tf.data.Dataset` erstellt haben, können Sie das Modell wie zuvor kompilieren und anpassen: + +```py +model.compile(optimizer=Adam(3e-5)) + +model.fit(tf_dataset) +``` + + + + + + +## Trainieren in nativem PyTorch + + + + + +[`Trainer`] kümmert sich um die Trainingsschleife und ermöglicht die Feinabstimmung eines Modells in einer einzigen Codezeile. Für Benutzer, die es vorziehen, ihre eigene Trainingsschleife zu schreiben, können Sie auch eine Feinabstimmung eines 🤗 Transformers-Modells in nativem PyTorch vornehmen. + +An diesem Punkt müssen Sie möglicherweise Ihr Notebook neu starten oder den folgenden Code ausführen, um etwas Speicher freizugeben: + +```py +del model +del pytorch_model +del trainer +torch.cuda.empty_cache() +``` + +Als Nächstes müssen Sie den Datensatz `tokenized_dataset` manuell nachbearbeiten, um ihn für das Training vorzubereiten. + +1. Entfernen Sie die Spalte "Text", da das Modell keinen Rohtext als Eingabe akzeptiert: + + ```py + >>> tokenized_datasets = tokenized_datasets.remove_columns(["text"]) + ``` + +2. Benennen Sie die Spalte "Label" in "Labels" um, da das Modell erwartet, dass das Argument "Labels" genannt wird: + + ```py + >>> tokenized_datasets = tokenized_datasets.rename_column("label", "labels") + ``` + +3. 
Stellen Sie das Format des Datensatzes so ein, dass PyTorch-Tensoren anstelle von Listen zurückgegeben werden: + + ```py + >>> tokenized_datasets.set_format("torch") + ``` + +Erstellen Sie dann eine kleinere Teilmenge des Datensatzes, wie zuvor gezeigt, um die Feinabstimmung zu beschleunigen: + +```py +>>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) +>>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) +``` + +### DataLoader + +Erstellen Sie einen `DataLoader` für Ihre Trainings- und Testdatensätze, damit Sie über die Datenstapel iterieren können: + +```py +>>> from torch.utils.data import DataLoader + +>>> train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8) +>>> eval_dataloader = DataLoader(small_eval_dataset, batch_size=8) +``` + +Laden Sie Ihr Modell mit der Anzahl der erwarteten Kennzeichnungen: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) +``` + +### Optimierer und Lernratensteuerung + +Erstellen Sie einen Optimierer und einen Scheduler für die Lernrate, um das Modell fein abzustimmen. Wir verwenden den Optimierer [`AdamW`](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) aus PyTorch: + +```py +>>> from torch.optim import AdamW + +>>> optimizer = AdamW(model.parameters(), lr=5e-5) +``` + +Erstellen Sie den Standard-Lernratenplaner aus [`Trainer`]: + +```py +>>> from transformers import get_scheduler + +>>> num_epochs = 3 +>>> num_training_steps = num_epochs * len(train_dataloader) +>>> lr_scheduler = get_scheduler( +... name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps +... ) +``` + +Geben Sie schließlich `device` an, um einen Grafikprozessor zu verwenden, wenn Sie Zugang zu einem solchen haben. Andernfalls kann das Training auf einer CPU mehrere Stunden statt ein paar Minuten dauern. + +```py +>>> import torch + +>>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +>>> model.to(device) +``` + + + +Holen Sie sich mit einem gehosteten Notebook wie [Colaboratory](https://colab.research.google.com/) oder [SageMaker StudioLab](https://studiolab.sagemaker.aws/) kostenlosen Zugang zu einem Cloud-GPU, wenn Sie noch keinen haben. + + + +Großartig, Sie sind bereit für das Training! 🥳 + +### Trainingsschleife + +Um Ihren Trainingsfortschritt zu verfolgen, verwenden Sie die [tqdm](https://tqdm.github.io/) Bibliothek, um einen Fortschrittsbalken über die Anzahl der Trainingsschritte hinzuzufügen: + +```py +>>> from tqdm.auto import tqdm + +>>> progress_bar = tqdm(range(num_training_steps)) + +>>> model.train() +>>> for epoch in range(num_epochs): +... for batch in train_dataloader: +... batch = {k: v.to(device) for k, v in batch.items()} +... outputs = model(**batch) +... loss = outputs.loss +... loss.backward() + +... optimizer.step() +... lr_scheduler.step() +... optimizer.zero_grad() +... progress_bar.update(1) +``` + +### Auswertung + +Genauso wie Sie eine Bewertungsfunktion zu [`Trainer`] hinzugefügt haben, müssen Sie dasselbe tun, wenn Sie Ihre eigene Trainingsschleife schreiben. Aber anstatt die Metrik am Ende jeder Epoche zu berechnen und zu melden, werden Sie dieses Mal alle Stapel mit [`~evaluate.add_batch`] akkumulieren und die Metrik ganz am Ende berechnen. 
+ +```py +>>> import evaluate + +>>> metric = evaluate.load("accuracy") +>>> model.eval() +>>> for batch in eval_dataloader: +... batch = {k: v.to(device) for k, v in batch.items()} +... with torch.no_grad(): +... outputs = model(**batch) + +... logits = outputs.logits +... predictions = torch.argmax(logits, dim=-1) +... metric.add_batch(predictions=predictions, references=batch["labels"]) + +>>> metric.compute() +``` + + + + + +## Zusätzliche Ressourcen + +Weitere Beispiele für die Feinabstimmung finden Sie unter: + +- [🤗 Transformers Examples](https://github.com/huggingface/transformers/tree/main/examples) enthält Skripte + um gängige NLP-Aufgaben in PyTorch und TensorFlow zu trainieren. + +- [🤗 Transformers Notebooks](notebooks) enthält verschiedene Notebooks zur Feinabstimmung eines Modells für bestimmte Aufgaben in PyTorch und TensorFlow. \ No newline at end of file diff --git a/docs/source/de/training.mdx b/docs/source/de/training.mdx deleted file mode 100644 index e38779ba5571..000000000000 --- a/docs/source/de/training.mdx +++ /dev/null @@ -1,429 +0,0 @@ - - -# Optimierung eines vortrainierten Modells - -[[open-in-colab]] - -Die Verwendung eines vorab trainierten Modells hat erhebliche Vorteile. Es reduziert die Rechenkosten und den CO2-Fußabdruck und ermöglicht Ihnen die Verwendung von Modellen, die dem neuesten Stand der Technik entsprechen, ohne dass Sie ein Modell von Grund auf neu trainieren müssen. Transformers bietet Zugang zu Tausenden von vortrainierten Modellen für eine Vielzahl von Aufgaben. Wenn Sie ein vorab trainiertes Modell verwenden, trainieren Sie es auf einem für Ihre Aufgabe spezifischen Datensatz. Dies wird als Feinabstimmung bezeichnet und ist eine unglaublich leistungsfähige Trainingstechnik. In diesem Tutorial werden Sie ein vortrainiertes Modell mit einem Deep-Learning-Framework Ihrer Wahl feinabstimmen: - -* Feinabstimmung eines vorab trainierten Modells mit 🤗 Transformers [`Trainer`]. -* Feinabstimmung eines vorab trainierten Modells in TensorFlow mit Keras. -* Feinabstimmung eines vorab trainierten Modells in nativem PyTorch. - - - -## Vorbereitung eines Datensatzes - - - -Bevor Sie die Feinabstimmung eines vortrainierten Modells vornehmen können, müssen Sie einen Datensatz herunterladen und für das Training vorbereiten. Im vorangegangenen Leitfaden haben Sie gelernt, wie man Daten für das Training aufbereitet, und jetzt haben Sie die Gelegenheit, diese Fähigkeiten zu testen! - -Laden Sie zunächst den Datensatz [Yelp Reviews](https://huggingface.co/datasets/yelp_review_full): - -```py ->>> from datasets import load_dataset - ->>> dataset = load_dataset("yelp_review_full") ->>> dataset["train"][100] -{'label': 0, - 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. 
She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'} -``` - -Wie Sie nun wissen, benötigen Sie einen Tokenizer, um den Text zu verarbeiten und eine Auffüll- und Abschneidungsstrategie einzubauen, um mit variablen Sequenzlängen umzugehen. Um Ihren Datensatz in einem Schritt zu verarbeiten, verwenden Sie die 🤗 Methode Datasets [`map`](https://huggingface.co/docs/datasets/process.html#map), um eine Vorverarbeitungsfunktion auf den gesamten Datensatz anzuwenden: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") - - ->>> def tokenize_function(examples): -... return tokenizer(examples["text"], padding="max_length", truncation=True) - - ->>> tokenized_datasets = dataset.map(tokenize_function, batched=True) -``` - -Wenn Sie möchten, können Sie eine kleinere Teilmenge des gesamten Datensatzes für die Feinabstimmung erstellen, um den Zeitaufwand zu verringern: - -```py ->>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) ->>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) -``` - - - -## Training - -An dieser Stelle sollten Sie dem Abschnitt folgen, der dem Rahmen entspricht, den Sie verwenden möchten. Sie können über die Links -in der rechten Seitenleiste können Sie zu dem gewünschten Abschnitt springen - und wenn Sie den gesamten Inhalt eines bestimmten Frameworks ausblenden möchten, -klicken Sie einfach auf die Schaltfläche oben rechts im Block des jeweiligen Frameworks! - - - - - -## Trainieren mit PyTorch Trainer - -🤗 Transformers bietet eine [`Trainer`]-Klasse, die für das Training von 🤗 Transformers-Modellen optimiert ist und es einfacher macht, mit dem Training zu beginnen, ohne manuell eine eigene Trainingsschleife zu schreiben. Die [`Trainer`]-API unterstützt eine breite Palette von Trainingsoptionen und Funktionen wie Logging, Gradientenakkumulation und gemischte Präzision. - -Beginnen Sie mit dem Laden Ihres Modells und geben Sie die Anzahl der erwarteten Labels an. Aus dem Yelp Review [dataset card](https://huggingface.co/datasets/yelp_review_full#data-fields) wissen Sie, dass es fünf Labels gibt: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) -``` - - - -Es wird eine Warnung angezeigt, dass einige der trainierten Parameter nicht verwendet werden und einige Parameter zufällig -initialisiert werden. Machen Sie sich keine Sorgen, das ist völlig normal! Der vorher trainierte Kopf des BERT-Modells wird verworfen und durch einen zufällig initialisierten Klassifikationskopf ersetzt. Sie werden diesen neuen Modellkopf in Ihrer Sequenzklassifizierungsaufgabe feinabstimmen, indem Sie das Wissen des vortrainierten Modells auf ihn übertragen. 
- - - -### Hyperparameter für das Training - -Als Nächstes erstellen Sie eine Klasse [`TrainingArguments`], die alle Hyperparameter enthält, die Sie einstellen können, sowie Flags zur Aktivierung verschiedener Trainingsoptionen. Für dieses Lernprogramm können Sie mit den Standard- [Hyperparametern](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments) beginnen, aber Sie können mit diesen experimentieren, um Ihre optimalen Einstellungen zu finden. - -Geben Sie an, wo die Kontrollpunkte Ihres Trainings gespeichert werden sollen: - -```py ->>> from transformers import TrainingArguments - ->>> training_args = TrainingArguments(output_dir="test_trainer") -``` - -### Auswerten - -Der [`Trainer`] wertet die Leistung des Modells während des Trainings nicht automatisch aus. Sie müssen [`Trainer`] eine Funktion übergeben, um Metriken zu berechnen und zu berichten. Die [🤗 Evaluate](https://huggingface.co/docs/evaluate/index) Bibliothek bietet eine einfache [`accuracy`](https://huggingface.co/spaces/evaluate-metric/accuracy) Funktion, die Sie mit der [`evaluate.load`] Funktion laden können (siehe diese [quicktour](https://huggingface.co/docs/evaluate/a_quick_tour) für weitere Informationen): - -```py ->>> import numpy as np ->>> import evaluate - ->>> metric = evaluate.load("accuracy") -``` - -Rufen Sie [`~evaluate.compute`] auf `metric` auf, um die Genauigkeit Ihrer Vorhersagen zu berechnen. Bevor Sie Ihre Vorhersagen an `compute` übergeben, müssen Sie die Vorhersagen in Logits umwandeln (denken Sie daran, dass alle 🤗 Transformers-Modelle Logits zurückgeben): - -```py ->>> def compute_metrics(eval_pred): -... logits, labels = eval_pred -... predictions = np.argmax(logits, axis=-1) -... return metric.compute(predictions=predictions, references=labels) -``` - -Wenn Sie Ihre Bewertungsmetriken während der Feinabstimmung überwachen möchten, geben Sie den Parameter `evaluation_strategy` in Ihren Trainingsargumenten an, um die Bewertungsmetrik am Ende jeder Epoche zu ermitteln: - -```py ->>> from transformers import TrainingArguments, Trainer - ->>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") -``` - -### Trainer - -Erstellen Sie ein [`Trainer`]-Objekt mit Ihrem Modell, Trainingsargumenten, Trainings- und Testdatensätzen und einer Evaluierungsfunktion: - -```py ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=small_train_dataset, -... eval_dataset=small_eval_dataset, -... compute_metrics=compute_metrics, -... ) -``` - -Anschließend können Sie Ihr Modell durch den Aufruf von [`~transformers.Trainer.train`] optimieren: - -```py ->>> trainer.train() -``` - - - - - - -## Trainieren Sie ein TensorFlow-Modell mit Keras - -Sie können auch 🤗 Transformers Modelle in TensorFlow mit der Keras API trainieren! - -### Laden von Daten für Keras - -Wenn Sie ein 🤗 Transformers Modell mit der Keras API trainieren wollen, müssen Sie Ihren Datensatz in ein Format konvertieren, das -Keras versteht. Wenn Ihr Datensatz klein ist, können Sie das Ganze einfach in NumPy-Arrays konvertieren und an Keras übergeben. -Probieren wir das zuerst aus, bevor wir etwas Komplizierteres tun. - -Laden Sie zunächst ein Dataset. Wir werden den CoLA-Datensatz aus dem [GLUE-Benchmark](https://huggingface.co/datasets/glue) verwenden, -da es sich um eine einfache Aufgabe zur Klassifizierung von binärem Text handelt, und nehmen vorerst nur den Trainingssplit. 
- -```py -from datasets import load_dataset - -dataset = load_dataset("glue", "cola") -dataset = dataset["train"] # Just take the training split for now -``` - -Als nächstes laden Sie einen Tokenizer und tokenisieren die Daten als NumPy-Arrays. Beachten Sie, dass die Beschriftungen bereits eine Liste von 0 und 1en sind, -Wir können sie also ohne Tokenisierung direkt in ein NumPy-Array konvertieren! - -```py -from transformers import AutoTokenizer - -tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") -tokenized_data = tokenizer(dataset["text"], return_tensors="np", padding=True) -# Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras -tokenized_data = dict(tokenized_data) - -labels = np.array(dataset["label"]) # Label is already an array of 0 and 1 -``` - -Schließlich laden, [`compile`](https://keras.io/api/models/model_training_apis/#compile-method) und [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) Sie das Modell: - -```py -from transformers import TFAutoModelForSequenceClassification -from tensorflow.keras.optimizers import Adam - -# Load and compile our model -model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased") -# Lower learning rates are often better for fine-tuning transformers -model.compile(optimizer=Adam(3e-5)) - -model.fit(tokenized_data, labels) -``` - - - -Sie müssen Ihren Modellen kein Verlustargument übergeben, wenn Sie sie `compile()`! Hugging-Face-Modelle wählen automatisch -einen Loss, der für ihre Aufgabe und Modellarchitektur geeignet ist, wenn dieses Argument leer gelassen wird. Sie können jederzeit außer Kraft setzen, indem Sie selbst einen Loss angeben, wenn Sie das möchten! - - - -Dieser Ansatz eignet sich hervorragend für kleinere Datensätze, aber bei größeren Datensätzen kann er zu einem Problem werden. Warum? -Weil das tokenisierte Array und die Beschriftungen vollständig in den Speicher geladen werden müssten, und weil NumPy nicht mit -"gezackte" Arrays nicht verarbeiten kann, so dass jedes tokenisierte Sample auf die Länge des längsten Samples im gesamten Datensatz aufgefüllt werden müsste. -Datensatzes aufgefüllt werden. Dadurch wird das Array noch größer, und all die aufgefüllten Token verlangsamen auch das Training! - -### Laden von Daten als tf.data.Dataset - -Wenn Sie eine Verlangsamung des Trainings vermeiden wollen, können Sie Ihre Daten stattdessen als `tf.data.Dataset` laden. Sie können zwar Ihre eigene -tf.data"-Pipeline schreiben können, wenn Sie wollen, haben wir zwei bequeme Methoden, um dies zu tun: - -- [`~TFPreTrainedModel.prepare_tf_dataset`]: Dies ist die Methode, die wir in den meisten Fällen empfehlen. Da es sich um eine Methode -Ihres Modells ist, kann sie das Modell inspizieren, um automatisch herauszufinden, welche Spalten als Modelleingaben verwendet werden können, und -verwirft die anderen, um einen einfacheren, leistungsfähigeren Datensatz zu erstellen. -- [~datasets.Dataset.to_tf_dataset`]: Diese Methode ist eher auf niedriger Ebene angesiedelt und ist nützlich, wenn Sie genau kontrollieren wollen, wie -Dataset erstellt wird, indem man genau angibt, welche `columns` und `label_cols` einbezogen werden sollen. 
- -Bevor Sie [~TFPreTrainedModel.prepare_tf_dataset`] verwenden können, müssen Sie die Tokenizer-Ausgaben als Spalten zu Ihrem Datensatz hinzufügen, wie in -dem folgenden Codebeispiel: - -```py -def tokenize_dataset(data): - # Keys of the returned dictionary will be added to the dataset as columns - return tokenizer(data["text"]) - - -dataset = dataset.map(tokenize_dataset) -``` - -Denken Sie daran, dass Hugging Face-Datensätze standardmäßig auf der Festplatte gespeichert werden, so dass dies nicht zu einem erhöhten Arbeitsspeicherbedarf führen wird! Sobald die -Spalten hinzugefügt wurden, können Sie Batches aus dem Datensatz streamen und zu jedem Batch Auffüllungen hinzufügen, was die Anzahl der Auffüllungs-Token im Vergleich zum Auffüllen des gesamten Datensatzes reduziert. - - -```py ->>> tf_dataset = model.prepare_tf_dataset(dataset, batch_size=16, shuffle=True, tokenizer=tokenizer) -``` - -Beachten Sie, dass Sie im obigen Codebeispiel den Tokenizer an `prepare_tf_dataset` übergeben müssen, damit die Stapel beim Laden korrekt aufgefüllt werden können. -Wenn alle Stichproben in Ihrem Datensatz die gleiche Länge haben und kein Auffüllen erforderlich ist, können Sie dieses Argument weglassen. -Wenn Sie etwas Komplexeres als nur das Auffüllen von Stichproben benötigen (z. B. das Korrumpieren von Token für die maskierte Sprachmodellierung), können Sie das Argument -Modellierung), können Sie stattdessen das Argument `collate_fn` verwenden, um eine Funktion zu übergeben, die aufgerufen wird, um die -Liste von Stichproben in einen Stapel umwandelt und alle gewünschten Vorverarbeitungen vornimmt. Siehe unsere -[examples](https://github.com/huggingface/transformers/tree/main/examples) oder -[notebooks](https://huggingface.co/docs/transformers/notebooks), um diesen Ansatz in Aktion zu sehen. - -Sobald Sie einen `tf.data.Dataset` erstellt haben, können Sie das Modell wie zuvor kompilieren und anpassen: - -```py -model.compile(optimizer=Adam(3e-5)) - -model.fit(tf_dataset) -``` - - - - - - -## Trainieren in nativem PyTorch - - - - - -[`Trainer`] kümmert sich um die Trainingsschleife und ermöglicht die Feinabstimmung eines Modells in einer einzigen Codezeile. Für Benutzer, die es vorziehen, ihre eigene Trainingsschleife zu schreiben, können Sie auch eine Feinabstimmung eines 🤗 Transformers-Modells in nativem PyTorch vornehmen. - -An diesem Punkt müssen Sie möglicherweise Ihr Notebook neu starten oder den folgenden Code ausführen, um etwas Speicher freizugeben: - -```py -del model -del pytorch_model -del trainer -torch.cuda.empty_cache() -``` - -Als Nächstes müssen Sie den Datensatz `tokenized_dataset` manuell nachbearbeiten, um ihn für das Training vorzubereiten. - -1. Entfernen Sie die Spalte "Text", da das Modell keinen Rohtext als Eingabe akzeptiert: - - ```py - >>> tokenized_datasets = tokenized_datasets.remove_columns(["text"]) - ``` - -2. Benennen Sie die Spalte "Label" in "Labels" um, da das Modell erwartet, dass das Argument "Labels" genannt wird: - - ```py - >>> tokenized_datasets = tokenized_datasets.rename_column("label", "labels") - ``` - -3. 
Stellen Sie das Format des Datensatzes so ein, dass PyTorch-Tensoren anstelle von Listen zurückgegeben werden: - - ```py - >>> tokenized_datasets.set_format("torch") - ``` - -Erstellen Sie dann eine kleinere Teilmenge des Datensatzes, wie zuvor gezeigt, um die Feinabstimmung zu beschleunigen: - -```py ->>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) ->>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) -``` - -### DataLoader - -Erstellen Sie einen `DataLoader` für Ihre Trainings- und Testdatensätze, damit Sie über die Datenstapel iterieren können: - -```py ->>> from torch.utils.data import DataLoader - ->>> train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8) ->>> eval_dataloader = DataLoader(small_eval_dataset, batch_size=8) -``` - -Laden Sie Ihr Modell mit der Anzahl der erwarteten Kennzeichnungen: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) -``` - -### Optimierer und Lernratensteuerung - -Erstellen Sie einen Optimierer und einen Scheduler für die Lernrate, um das Modell fein abzustimmen. Wir verwenden den Optimierer [`AdamW`](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) aus PyTorch: - -```py ->>> from torch.optim import AdamW - ->>> optimizer = AdamW(model.parameters(), lr=5e-5) -``` - -Erstellen Sie den Standard-Lernratenplaner aus [`Trainer`]: - -```py ->>> from transformers import get_scheduler - ->>> num_epochs = 3 ->>> num_training_steps = num_epochs * len(train_dataloader) ->>> lr_scheduler = get_scheduler( -... name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps -... ) -``` - -Geben Sie schließlich `device` an, um einen Grafikprozessor zu verwenden, wenn Sie Zugang zu einem solchen haben. Andernfalls kann das Training auf einer CPU mehrere Stunden statt ein paar Minuten dauern. - -```py ->>> import torch - ->>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") ->>> model.to(device) -``` - - - -Holen Sie sich mit einem gehosteten Notebook wie [Colaboratory](https://colab.research.google.com/) oder [SageMaker StudioLab](https://studiolab.sagemaker.aws/) kostenlosen Zugang zu einem Cloud-GPU, wenn Sie noch keinen haben. - - - -Großartig, Sie sind bereit für das Training! 🥳 - -### Trainingsschleife - -Um Ihren Trainingsfortschritt zu verfolgen, verwenden Sie die [tqdm](https://tqdm.github.io/) Bibliothek, um einen Fortschrittsbalken über die Anzahl der Trainingsschritte hinzuzufügen: - -```py ->>> from tqdm.auto import tqdm - ->>> progress_bar = tqdm(range(num_training_steps)) - ->>> model.train() ->>> for epoch in range(num_epochs): -... for batch in train_dataloader: -... batch = {k: v.to(device) for k, v in batch.items()} -... outputs = model(**batch) -... loss = outputs.loss -... loss.backward() - -... optimizer.step() -... lr_scheduler.step() -... optimizer.zero_grad() -... progress_bar.update(1) -``` - -### Auswertung - -Genauso wie Sie eine Bewertungsfunktion zu [`Trainer`] hinzugefügt haben, müssen Sie dasselbe tun, wenn Sie Ihre eigene Trainingsschleife schreiben. Aber anstatt die Metrik am Ende jeder Epoche zu berechnen und zu melden, werden Sie dieses Mal alle Stapel mit [`~evaluate.add_batch`] akkumulieren und die Metrik ganz am Ende berechnen. 
- -```py ->>> import evaluate - ->>> metric = evaluate.load("accuracy") ->>> model.eval() ->>> for batch in eval_dataloader: -... batch = {k: v.to(device) for k, v in batch.items()} -... with torch.no_grad(): -... outputs = model(**batch) - -... logits = outputs.logits -... predictions = torch.argmax(logits, dim=-1) -... metric.add_batch(predictions=predictions, references=batch["labels"]) - ->>> metric.compute() -``` - - - - - -## Zusätzliche Ressourcen - -Weitere Beispiele für die Feinabstimmung finden Sie unter: - -- [🤗 Transformers Examples](https://github.com/huggingface/transformers/tree/main/examples) enthält Skripte - um gängige NLP-Aufgaben in PyTorch und TensorFlow zu trainieren. - -- [🤗 Transformers Notebooks](notebooks) enthält verschiedene Notebooks zur Feinabstimmung eines Modells für bestimmte Aufgaben in PyTorch und TensorFlow. \ No newline at end of file diff --git a/docs/source/de/transformers_agents.md b/docs/source/de/transformers_agents.md new file mode 100644 index 000000000000..1d676c395e17 --- /dev/null +++ b/docs/source/de/transformers_agents.md @@ -0,0 +1,323 @@ + + +# Transformers Agents + + + +Transformers Agents ist eine experimentelle API, die jederzeit geändert werden kann. Die von den Agenten zurückgegebenen Ergebnisse +zurückgegeben werden, können variieren, da sich die APIs oder die zugrunde liegenden Modelle ändern können. + + + +Transformers Version v4.29.0, die auf dem Konzept von *Tools* und *Agenten* aufbaut. Sie können damit spielen in +[dieses Colab](https://colab.research.google.com/drive/1c7MHD-T1forUPGcC_jlwsIptOzpG3hSj). + +Kurz gesagt, es bietet eine API für natürliche Sprache auf der Grundlage von Transformers: Wir definieren eine Reihe von kuratierten Tools und entwerfen einen +Agenten, um natürliche Sprache zu interpretieren und diese Werkzeuge zu verwenden. Es ist von vornherein erweiterbar; wir haben einige relevante Tools kuratiert, +aber wir werden Ihnen zeigen, wie das System einfach erweitert werden kann, um jedes von der Community entwickelte Tool zu verwenden. + +Beginnen wir mit einigen Beispielen dafür, was mit dieser neuen API erreicht werden kann. Sie ist besonders leistungsfähig, wenn es um +Sie ist besonders leistungsstark, wenn es um multimodale Aufgaben geht. Lassen Sie uns also eine Runde drehen, um Bilder zu erzeugen und Text vorzulesen. + +```py +agent.run("Caption the following image", image=image) +``` + +| **Input** | **Output** | +|-----------------------------------------------------------------------------------------------------------------------------|-----------------------------------| +| | A beaver is swimming in the water | + +--- + +```py +agent.run("Read the following text out loud", text=text) +``` +| **Input** | **Output** | +|-------------------------------------------------------------------------------------------------------------------------|----------------------------------------------| +| A beaver is swimming in the water | + +--- + +```py +agent.run( + "In the following `document`, where will the TRRF Scientific Advisory Council Meeting take place?", + document=document, +) +``` +| **Input** | **Output** | +|-----------------------------------------------------------------------------------------------------------------------------|----------------| +| | ballroom foyer | + +## Schnellstart + +Bevor Sie `agent.run` verwenden können, müssen Sie einen Agenten instanziieren, der ein großes Sprachmodell (LLM) ist. 
+Wir bieten Unterstützung für openAI-Modelle sowie für OpenSource-Alternativen von BigCode und OpenAssistant. Die openAI +Modelle sind leistungsfähiger (erfordern aber einen openAI-API-Schlüssel, können also nicht kostenlos verwendet werden); Hugging Face +bietet kostenlosen Zugang zu Endpunkten für BigCode- und OpenAssistant-Modelle. + +To start with, please install the `agents` extras in order to install all default dependencies. +```bash +pip install transformers[agents] +``` + +Um openAI-Modelle zu verwenden, instanziieren Sie einen [`OpenAiAgent`], nachdem Sie die `openai`-Abhängigkeit installiert haben: + +```bash +pip install openai +``` + + +```py +from transformers import OpenAiAgent + +agent = OpenAiAgent(model="text-davinci-003", api_key="") +``` + +Um BigCode oder OpenAssistant zu verwenden, melden Sie sich zunächst an, um Zugriff auf die Inference API zu erhalten: + +```py +from huggingface_hub import login + +login("") +``` + +Dann instanziieren Sie den Agenten + +```py +from transformers import HfAgent + +# Starcoder +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder") +# StarcoderBase +# agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoderbase") +# OpenAssistant +# agent = HfAgent(url_endpoint="https://api-inference.huggingface.co/models/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5") +``` + +Dies geschieht mit der Inferenz-API, die Hugging Face derzeit kostenlos zur Verfügung stellt. Wenn Sie Ihren eigenen Inferenz +Endpunkt für dieses Modell (oder einen anderen) haben, können Sie die obige URL durch Ihren URL-Endpunkt ersetzen. + + + +StarCoder und OpenAssistant sind kostenlos und leisten bei einfachen Aufgaben bewundernswert gute Arbeit. Allerdings halten die Kontrollpunkte +nicht, wenn es um komplexere Aufforderungen geht. Wenn Sie mit einem solchen Problem konfrontiert sind, empfehlen wir Ihnen, das OpenAI +Modell auszuprobieren, das zwar leider nicht quelloffen ist, aber zur Zeit eine bessere Leistung erbringt. + + + +Sie sind jetzt startklar! Lassen Sie uns in die beiden APIs eintauchen, die Ihnen jetzt zur Verfügung stehen. + +### Einzelne Ausführung (run) + +Die Methode der einmaligen Ausführung ist die Verwendung der [`~Agent.run`] Methode des Agenten: + +```py +agent.run("Draw me a picture of rivers and lakes.") +``` + + + +Es wählt automatisch das (oder die) Werkzeug(e) aus, das (die) für die von Ihnen gewünschte Aufgabe geeignet ist (sind) und führt es (sie) entsprechend aus. Es +kann eine oder mehrere Aufgaben in der gleichen Anweisung ausführen (je komplexer Ihre Anweisung ist, desto wahrscheinlicher ist ein +der Agent scheitern). + +```py +agent.run("Draw me a picture of the sea then transform the picture to add an island") +``` + + + +
+ + +Jede [`~Agent.run`] Operation ist unabhängig, so dass Sie sie mehrmals hintereinander mit unterschiedlichen Aufgaben ausführen können. + +Beachten Sie, dass Ihr `Agent` nur ein großsprachiges Modell ist, so dass kleine Variationen in Ihrer Eingabeaufforderung völlig unterschiedliche Ergebnisse liefern können. +unterschiedliche Ergebnisse liefern. Es ist wichtig, dass Sie die Aufgabe, die Sie ausführen möchten, so genau wie möglich erklären. Wir gehen noch weiter ins Detail +wie man gute Prompts schreibt [hier](custom_tools#writing-good-user-inputs). + +Wenn Sie einen Status über Ausführungszeiten hinweg beibehalten oder dem Agenten Nicht-Text-Objekte übergeben möchten, können Sie dies tun, indem Sie +Variablen, die der Agent verwenden soll. Sie könnten zum Beispiel das erste Bild von Flüssen und Seen erzeugen, +und das Modell bitten, dieses Bild zu aktualisieren und eine Insel hinzuzufügen, indem Sie Folgendes tun: + +```python +picture = agent.run("Generate a picture of rivers and lakes.") +updated_picture = agent.run("Transform the image in `picture` to add an island to it.", picture=picture) +``` + + + +Dies kann hilfreich sein, wenn das Modell Ihre Anfrage nicht verstehen kann und die Werkzeuge verwechselt. Ein Beispiel wäre: + +```py +agent.run("Draw me the picture of a capybara swimming in the sea") +``` + +Hier könnte das Modell auf zwei Arten interpretieren: +- Die Funktion `Text-zu-Bild` erzeugt ein Wasserschwein, das im Meer schwimmt. +- Oder Sie lassen das `Text-zu-Bild` ein Wasserschwein erzeugen und verwenden dann das Werkzeug `Bildtransformation`, um es im Meer schwimmen zu lassen. + +Falls Sie das erste Szenario erzwingen möchten, können Sie dies tun, indem Sie die Eingabeaufforderung als Argument übergeben: + +```py +agent.run("Draw me a picture of the `prompt`", prompt="a capybara swimming in the sea") +``` + + + + +### Chat-basierte Ausführung (Chat) + +Der Agent verfügt auch über einen Chat-basierten Ansatz, der die Methode [`~Agent.chat`] verwendet: + +```py +agent.chat("Generate a picture of rivers and lakes") +``` + + + +```py +agent.chat("Transform the picture so that there is a rock in there") +``` + + + +
+
+
+Dies ist ein interessanter Ansatz, wenn Sie den Zustand über Anweisungen hinweg beibehalten möchten. Er eignet sich besser zum Experimentieren,
+kommt aber mit einzelnen Anweisungen deutlich besser zurecht als mit komplexen Anweisungen (die die [`~Agent.run`]-Methode
+besser verarbeiten kann).
+
+Diese Methode kann auch Argumente entgegennehmen, wenn Sie Nicht-Text-Typen oder bestimmte Aufforderungen übergeben möchten.
+
+### ⚠️ Fernausführung
+
+Zu Demonstrationszwecken und damit es mit allen Setups verwendet werden kann, haben wir Remote-Executors für mehrere
+der Standard-Tools erstellt, auf die der Agent in dieser Version Zugriff hat. Diese wurden mit
+[inference endpoints](https://huggingface.co/inference-endpoints) erstellt.
+
+Wir haben diese vorerst deaktiviert, aber um zu sehen, wie Sie Remote-Executor-Tools selbst einrichten können,
+empfehlen wir die Lektüre des [custom tool guide](./custom_tools).
+
+### Was passiert hier? Was sind Tools und was sind Agenten?
+
+
+
+#### Agenten
+
+Der "Agent" ist hier ein großes Sprachmodell, das wir so auffordern, dass es Zugang zu einem bestimmten Satz von Tools hat.
+
+LLMs sind ziemlich gut darin, kleine Codebeispiele zu erzeugen. Diese API macht sich das zunutze, indem sie das
+LLM auffordert, ein kleines Codebeispiel zu liefern, das eine Aufgabe mit einer Reihe von Werkzeugen ausführt. Diese Aufforderung wird dann um die
+Aufgabe, die Sie Ihrem Agenten geben, und um die Beschreibung der Werkzeuge, die Sie ihm geben, ergänzt. Auf diese Weise erhält das LLM Zugriff auf die Dokumentation der
+Tools, insbesondere auf die erwarteten Eingaben und Ausgaben, und kann den entsprechenden Code generieren.
+
+#### Tools
+
+Tools sind sehr einfach: Sie bestehen aus einer einzigen Funktion mit einem Namen und einer Beschreibung. Wir verwenden dann die Beschreibungen dieser Tools,
+um den Agenten aufzufordern. Anhand der Eingabeaufforderung zeigen wir dem Agenten, wie er die Tools nutzen kann, um das auszuführen, was
+in der Abfrage angefordert wurde.
+
+Dies geschieht mit brandneuen Tools und nicht mit Pipelines, denn der Agent schreibt besseren Code mit sehr atomaren Tools.
+Pipelines sind stärker refaktorisiert und fassen oft mehrere Aufgaben in einer einzigen zusammen. Tools sind dafür gedacht, sich auf
+eine einzige, sehr einfache Aufgabe zu konzentrieren.
+
+#### Code-Ausführung?!
+
+Dieser Code wird dann mit unserem kleinen Python-Interpreter auf den mit Ihren Tools übergebenen Eingaben ausgeführt.
+Wir hören Sie schon "Willkürliche Codeausführung!" rufen, aber lassen Sie uns erklären, warum das nicht der Fall ist.
+
+Die einzigen Funktionen, die aufgerufen werden können, sind die von Ihnen zur Verfügung gestellten Tools und die `print`-Funktion, so dass bereits eingeschränkt ist,
+was ausgeführt werden kann. Solange es auf die Hugging-Face-Tools beschränkt ist, sollten Sie auf der sicheren Seite sein.
+
+Außerdem lassen wir keine Attributsuche oder Importe zu (die ohnehin nicht benötigt werden, um die
+Eingaben/Ausgaben an eine kleine Gruppe von Funktionen zu übergeben), so dass alle offensichtlichen Angriffe (und Sie müssten das LLM
+schon gezielt dazu auffordern, solchen Code auszugeben) kein Problem darstellen sollten. Wenn Sie auf Nummer sicher gehen wollen, können Sie die
+run()-Methode mit dem zusätzlichen Argument return_code=True ausführen. In diesem Fall gibt der Agent nur den auszuführenden Code
+zurück, und Sie können selbst entscheiden, ob Sie ihn ausführen möchten oder nicht.
+ +Die Ausführung bricht bei jeder Zeile ab, in der versucht wird, eine illegale Operation auszuführen, oder wenn ein regulärer Python-Fehler +mit dem vom Agenten generierten Code. + +### Ein kuratierter Satz von Tools + +Wir haben eine Reihe von Tools identifiziert, die solche Agenten unterstützen können. Hier ist eine aktualisierte Liste der Tools, die wir integriert haben +in `transformers` integriert haben: + +- **Beantwortung von Fragen zu Dokumenten**: Beantworten Sie anhand eines Dokuments (z.B. PDF) im Bildformat eine Frage zu diesem Dokument ([Donut](./model_doc/donut)) +- Beantworten von Textfragen**: Geben Sie einen langen Text und eine Frage an, beantworten Sie die Frage im Text ([Flan-T5](./model_doc/flan-t5)) +- **Unbedingte Bildunterschriften**: Beschriften Sie das Bild! ([BLIP](./model_doc/blip)) +- **Bildfragebeantwortung**: Beantworten Sie bei einem Bild eine Frage zu diesem Bild ([VILT](./model_doc/vilt)) +- **Bildsegmentierung**: Geben Sie ein Bild und einen Prompt an und geben Sie die Segmentierungsmaske dieses Prompts aus ([CLIPSeg](./model_doc/clipseg)) +- **Sprache in Text**: Geben Sie eine Audioaufnahme einer sprechenden Person an und transkribieren Sie die Sprache in Text ([Whisper](./model_doc/whisper)) +- **Text in Sprache**: wandelt Text in Sprache um ([SpeechT5](./model_doc/speecht5)) +- **Zero-Shot-Textklassifizierung**: Ermitteln Sie anhand eines Textes und einer Liste von Bezeichnungen, welcher Bezeichnung der Text am ehesten entspricht ([BART](./model_doc/bart)) +- **Textzusammenfassung**: fassen Sie einen langen Text in einem oder wenigen Sätzen zusammen ([BART](./model_doc/bart)) +- **Übersetzung**: Übersetzen des Textes in eine bestimmte Sprache ([NLLB](./model_doc/nllb)) + +Diese Tools sind in Transformatoren integriert und können auch manuell verwendet werden, zum Beispiel: + +```py +from transformers import load_tool + +tool = load_tool("text-to-speech") +audio = tool("This is a text to speech tool") +``` + +### Benutzerdefinierte Tools + +Wir haben zwar eine Reihe von Tools identifiziert, sind aber der festen Überzeugung, dass der Hauptwert dieser Implementierung darin besteht +die Möglichkeit, benutzerdefinierte Tools schnell zu erstellen und weiterzugeben. + +Indem Sie den Code eines Tools in einen Hugging Face Space oder ein Modell-Repository stellen, können Sie das Tool +direkt mit dem Agenten nutzen. Wir haben ein paar neue Funktionen hinzugefügt +**transformers-agnostic** Tools zur [`huggingface-tools` Organisation](https://huggingface.co/huggingface-tools) hinzugefügt: + +- **Text-Downloader**: zum Herunterladen eines Textes von einer Web-URL +- **Text zu Bild**: erzeugt ein Bild nach einer Eingabeaufforderung und nutzt dabei stabile Diffusion +- **Bildtransformation**: verändert ein Bild anhand eines Ausgangsbildes und einer Eingabeaufforderung, unter Ausnutzung der stabilen pix2pix-Diffusion +- **Text zu Video**: Erzeugen eines kleinen Videos nach einer Eingabeaufforderung, unter Verwendung von damo-vilab + +Das Text-zu-Bild-Tool, das wir von Anfang an verwendet haben, ist ein Remote-Tool, das sich in +[*huggingface-tools/text-to-image*](https://huggingface.co/spaces/huggingface-tools/text-to-image)! Wir werden +weiterhin solche Tools für diese und andere Organisationen veröffentlichen, um diese Implementierung weiter zu verbessern. + +Die Agenten haben standardmäßig Zugriff auf die Tools, die sich auf [*huggingface-tools*](https://huggingface.co/huggingface-tools) befinden. 
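+As a quick illustration, any of these Hub-hosted tools can also be loaded manually and handed to an agent when it is
+created. The snippet below is a sketch: the endpoint URL is only an example, and the `additional_tools` argument is
+assumed to be available as described in the custom tools guide rather than guaranteed here:
+
+```py
+from transformers import HfAgent, load_tool
+
+# Sketch: load one of the Hub-hosted tools manually...
+image_generator = load_tool("huggingface-tools/text-to-image")
+
+# ...and pass it to an agent at creation time (illustrative endpoint URL).
+agent = HfAgent(
+    "https://api-inference.huggingface.co/models/bigcode/starcoder",
+    additional_tools=[image_generator],
+)
+```
+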
+Wie Sie Ihre eigenen Tools schreiben und freigeben können und wie Sie jedes benutzerdefinierte Tool, das sich auf dem Hub befindet, nutzen können, erklären wir in [folgender Anleitung](custom_tools). + +### Code-Erzeugung + +Bisher haben wir gezeigt, wie Sie die Agenten nutzen können, um Aktionen für Sie durchzuführen. Der Agent generiert jedoch nur Code +den wir dann mit einem sehr eingeschränkten Python-Interpreter ausführen. Falls Sie den generierten Code in einer anderen Umgebung verwenden möchten +einer anderen Umgebung verwenden möchten, können Sie den Agenten auffordern, den Code zusammen mit einer Tooldefinition und genauen Importen zurückzugeben. + +Zum Beispiel die folgende Anweisung +```python +agent.run("Draw me a picture of rivers and lakes", return_code=True) +``` + +gibt den folgenden Code zurück + +```python +from transformers import load_tool + +image_generator = load_tool("huggingface-tools/text-to-image") + +image = image_generator(prompt="rivers and lakes") +``` + +die Sie dann selbst ändern und ausführen können. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml old mode 100755 new mode 100644 index 3573c6070cdc..d8e92f43b4f3 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -8,67 +8,55 @@ title: Get started - sections: - local: pipeline_tutorial - title: Pipelines for inference + title: Run inference with pipelines - local: autoclass_tutorial - title: Load pretrained instances with an AutoClass + title: Write portable code with AutoClass - local: preprocessing - title: Preprocess + title: Preprocess data - local: training title: Fine-tune a pretrained model + - local: run_scripts + title: Train with a script - local: accelerate - title: Distributed training with 🤗 Accelerate + title: Set up distributed training with 🤗 Accelerate + - local: peft + title: Load and train adapters with 🤗 PEFT - local: model_sharing - title: Share a model + title: Share your model + - local: transformers_agents + title: Agents + - local: llm_tutorial + title: Generation with LLMs title: Tutorials - sections: - - sections: - - local: create_a_model - title: Create a custom architecture - - local: custom_models - title: Sharing custom models - - local: run_scripts - title: Train with a script - - local: sagemaker - title: Run training on Amazon SageMaker - - local: converting_tensorflow_models - title: Converting from TensorFlow checkpoints - - local: serialization - title: Export to ONNX - - local: torchscript - title: Export to TorchScript - - local: troubleshooting - title: Troubleshoot - title: General usage - - sections: - - local: fast_tokenizers - title: Use tokenizers from 🤗 Tokenizers - - local: multilingual - title: Inference for multilingual models - - sections: - - local: tasks/sequence_classification - title: Text classification - - local: tasks/token_classification - title: Token classification - - local: tasks/question_answering - title: Question answering - - local: tasks/language_modeling - title: Language modeling - - local: tasks/translation - title: Translation - - local: tasks/summarization - title: Summarization - - local: tasks/multiple_choice - title: Multiple choice - title: Task guides - isExpanded: false + - isExpanded: false + sections: + - local: tasks/sequence_classification + title: Text classification + - local: tasks/token_classification + title: Token classification + - local: tasks/question_answering + title: Question answering + - local: tasks/language_modeling + title: Causal language modeling + - local: 
tasks/masked_language_modeling + title: Masked language modeling + - local: tasks/translation + title: Translation + - local: tasks/summarization + title: Summarization + - local: tasks/multiple_choice + title: Multiple choice title: Natural Language Processing - - sections: + - isExpanded: false + sections: - local: tasks/audio_classification title: Audio classification - local: tasks/asr title: Automatic speech recognition title: Audio - - sections: + - isExpanded: false + sections: - local: tasks/image_classification title: Image classification - local: tasks/semantic_segmentation @@ -77,22 +65,89 @@ title: Video classification - local: tasks/object_detection title: Object detection + - local: tasks/zero_shot_object_detection + title: Zero-shot object detection + - local: tasks/zero_shot_image_classification + title: Zero-shot image classification + - local: tasks/monocular_depth_estimation + title: Depth estimation title: Computer Vision + - isExpanded: false + sections: + - local: tasks/image_captioning + title: Image captioning + - local: tasks/document_question_answering + title: Document Question Answering + - local: tasks/visual_question_answering + title: Visual Question Answering + - local: tasks/text-to-speech + title: Text to speech + title: Multimodal + - isExpanded: false + sections: + - local: generation_strategies + title: Customize the generation strategy + title: Generation + - isExpanded: false + sections: + - local: tasks/idefics + title: Image tasks with IDEFICS + title: Prompting + title: Task Guides +- sections: + - local: fast_tokenizers + title: Use fast tokenizers from 🤗 Tokenizers + - local: multilingual + title: Run inference with multilingual models + - local: create_a_model + title: Use model-specific APIs + - local: custom_models + title: Share a custom model + - local: chat_templating + title: Templates for chat models + - local: sagemaker + title: Run training on Amazon SageMaker + - local: serialization + title: Export to ONNX + - local: tflite + title: Export to TFLite + - local: torchscript + title: Export to TorchScript + - local: benchmarks + title: Benchmarks + - local: notebooks + title: Notebooks with examples + - local: community + title: Community resources + - local: custom_tools + title: Custom Tools and Prompts + - local: troubleshooting + title: Troubleshoot + title: Developer guides +- sections: + - local: performance + title: Overview - sections: - - local: performance - title: Overview - local: perf_train_gpu_one - title: Training on one GPU + title: Methods and tools for efficient training on a single GPU - local: perf_train_gpu_many - title: Training on many GPUs + title: Multiple GPUs and parallelism - local: perf_train_cpu - title: Training on CPU + title: Efficient training on CPU - local: perf_train_cpu_many - title: Training on many CPUs + title: Distributed CPU training - local: perf_train_tpu title: Training on TPUs + - local: perf_train_tpu_tf + title: Training on TPU with TensorFlow - local: perf_train_special title: Training on Specialized Hardware + - local: perf_hardware + title: Custom hardware for training + - local: hpo_train + title: Hyperparameter Search using Trainer API + title: Efficient training techniques + - sections: - local: perf_infer_cpu title: Inference on CPU - local: perf_infer_gpu_one @@ -101,49 +156,45 @@ title: Inference on many GPUs - local: perf_infer_special title: Inference on Specialized Hardware - - local: perf_hardware - title: Custom hardware for training - - local: big_models - title: 
Instantiating a big model - - local: debugging - title: Debugging - - local: hpo_train - title: Hyperparameter Search using Trainer API - title: Performance and scalability - - sections: - - local: contributing - title: How to contribute to transformers? - - local: add_new_model - title: How to add a model to 🤗 Transformers? - - local: add_tensorflow_model - title: How to convert a 🤗 Transformers model to TensorFlow? - - local: add_new_pipeline - title: How to add a pipeline to 🤗 Transformers? - - local: testing - title: Testing - - local: pr_checks - title: Checks on a Pull Request - title: Contribute - - local: notebooks - title: 🤗 Transformers Notebooks - - local: community - title: Community resources - - local: benchmarks - title: Benchmarks - - local: migration - title: Migrating from previous packages - title: How-to guides + title: Optimizing inference + - local: big_models + title: Instantiating a big model + - local: debugging + title: Troubleshooting + - local: tf_xla + title: XLA Integration for TensorFlow Models + - local: perf_torch_compile + title: Optimize inference using `torch.compile()` + title: Performance and scalability +- sections: + - local: contributing + title: How to contribute to transformers? + - local: add_new_model + title: How to add a model to 🤗 Transformers? + - local: add_tensorflow_model + title: How to convert a 🤗 Transformers model to TensorFlow? + - local: add_new_pipeline + title: How to add a pipeline to 🤗 Transformers? + - local: testing + title: Testing + - local: pr_checks + title: Checks on a Pull Request + title: Contribute - sections: - local: philosophy title: Philosophy - local: glossary title: Glossary - local: task_summary - title: Summary of the tasks + title: What 🤗 Transformers can do + - local: tasks_explained + title: How 🤗 Transformers solve tasks - local: model_summary - title: Summary of the models + title: The Transformer model family - local: tokenizer_summary title: Summary of the tokenizers + - local: attention + title: Attention mechanisms - local: pad_truncation title: Padding and truncation - local: bertology @@ -152,9 +203,13 @@ title: Perplexity of fixed-length models - local: pipeline_webserver title: Pipelines for webserver inference + - local: model_memory_anatomy + title: Model training anatomy title: Conceptual guides - sections: - sections: + - local: main_classes/agent + title: Agents and Tools - local: model_doc/auto title: Auto Classes - local: main_classes/callback @@ -181,6 +236,8 @@ title: Pipelines - local: main_classes/processors title: Processors + - local: main_classes/quantization + title: Quantization - local: main_classes/tokenizer title: Tokenizer - local: main_classes/trainer @@ -233,10 +290,14 @@ title: CANINE - local: model_doc/codegen title: CodeGen + - local: model_doc/code_llama + title: CodeLlama - local: model_doc/convbert title: ConvBERT - local: model_doc/cpm title: CPM + - local: model_doc/cpmant + title: CPMANT - local: model_doc/ctrl title: CTRL - local: model_doc/deberta @@ -255,10 +316,16 @@ title: Encoder Decoder Models - local: model_doc/ernie title: ERNIE + - local: model_doc/ernie_m + title: ErnieM - local: model_doc/esm title: ESM + - local: model_doc/falcon + title: Falcon - local: model_doc/flan-t5 title: FLAN-T5 + - local: model_doc/flan-ul2 + title: FLAN-UL2 - local: model_doc/flaubert title: FlauBERT - local: model_doc/fnet @@ -279,6 +346,10 @@ title: GPT-J - local: model_doc/gpt2 title: GPT2 + - local: model_doc/gpt_bigcode + title: GPTBigCode + - local: 
model_doc/gptsan-japanese + title: GPTSAN Japanese - local: model_doc/gpt-sw3 title: GPTSw3 - local: model_doc/herbert @@ -287,12 +358,12 @@ title: I-BERT - local: model_doc/jukebox title: Jukebox - - local: model_doc/layoutlm - title: LayoutLM - local: model_doc/led title: LED - - local: model_doc/lilt - title: LiLT + - local: model_doc/llama + title: LLaMA + - local: model_doc/llama2 + title: Llama2 - local: model_doc/longformer title: Longformer - local: model_doc/longt5 @@ -307,16 +378,24 @@ title: MarkupLM - local: model_doc/mbart title: MBart and MBart-50 + - local: model_doc/mega + title: MEGA - local: model_doc/megatron-bert title: MegatronBERT - local: model_doc/megatron_gpt2 title: MegatronGPT2 + - local: model_doc/mistral + title: Mistral - local: model_doc/mluke title: mLUKE - local: model_doc/mobilebert title: MobileBERT - local: model_doc/mpnet title: MPNet + - local: model_doc/mpt + title: MPT + - local: model_doc/mra + title: MRA - local: model_doc/mt5 title: MT5 - local: model_doc/mvp @@ -325,14 +404,20 @@ title: NEZHA - local: model_doc/nllb title: NLLB + - local: model_doc/nllb-moe + title: NLLB-MoE - local: model_doc/nystromformer title: Nyströmformer + - local: model_doc/open-llama + title: Open-Llama - local: model_doc/opt title: OPT - local: model_doc/pegasus title: Pegasus - local: model_doc/pegasus_x title: PEGASUS-X + - local: model_doc/persimmon + title: Persimmon - local: model_doc/phobert title: PhoBERT - local: model_doc/plbart @@ -359,6 +444,8 @@ title: RoCBert - local: model_doc/roformer title: RoFormer + - local: model_doc/rwkv + title: RWKV - local: model_doc/splinter title: Splinter - local: model_doc/squeezebert @@ -369,14 +456,16 @@ title: T5 - local: model_doc/t5v1.1 title: T5v1.1 - - local: model_doc/tapas - title: TAPAS - local: model_doc/tapex title: TAPEX - local: model_doc/transfo-xl title: Transformer XL - local: model_doc/ul2 title: UL2 + - local: model_doc/umt5 + title: UMT5 + - local: model_doc/xmod + title: X-MOD - local: model_doc/xglm title: XGLM - local: model_doc/xlm @@ -387,6 +476,8 @@ title: XLM-RoBERTa - local: model_doc/xlm-roberta-xl title: XLM-RoBERTa-XL + - local: model_doc/xlm-v + title: XLM-V - local: model_doc/xlnet title: XLNet - local: model_doc/yoso @@ -402,26 +493,40 @@ title: Conditional DETR - local: model_doc/convnext title: ConvNeXT + - local: model_doc/convnextv2 + title: ConvNeXTV2 - local: model_doc/cvt title: CvT - local: model_doc/deformable_detr title: Deformable DETR - local: model_doc/deit title: DeiT + - local: model_doc/deta + title: DETA - local: model_doc/detr title: DETR - local: model_doc/dinat title: DiNAT + - local: model_doc/dinov2 + title: DINOV2 - local: model_doc/dit title: DiT - local: model_doc/dpt title: DPT + - local: model_doc/efficientformer + title: EfficientFormer + - local: model_doc/efficientnet + title: EfficientNet + - local: model_doc/focalnet + title: FocalNet - local: model_doc/glpn title: GLPN - local: model_doc/imagegpt title: ImageGPT - local: model_doc/levit title: LeViT + - local: model_doc/mask2former + title: Mask2Former - local: model_doc/maskformer title: MaskFormer - local: model_doc/mobilenet_v1 @@ -430,16 +535,22 @@ title: MobileNetV2 - local: model_doc/mobilevit title: MobileViT + - local: model_doc/mobilevitv2 + title: MobileViTV2 - local: model_doc/nat title: NAT - local: model_doc/poolformer title: PoolFormer + - local: model_doc/pvt + title: Pyramid Vision Transformer (PVT) - local: model_doc/regnet title: RegNet - local: model_doc/resnet title: ResNet - local: 
model_doc/segformer title: SegFormer + - local: model_doc/swiftformer + title: SwiftFormer - local: model_doc/swin title: Swin Transformer - local: model_doc/swinv2 @@ -450,6 +561,8 @@ title: Table Transformer - local: model_doc/timesformer title: TimeSformer + - local: model_doc/upernet + title: UperNet - local: model_doc/van title: VAN - local: model_doc/videomae @@ -458,10 +571,16 @@ title: Vision Transformer (ViT) - local: model_doc/vit_hybrid title: ViT Hybrid + - local: model_doc/vitdet + title: ViTDet - local: model_doc/vit_mae title: ViTMAE + - local: model_doc/vitmatte + title: ViTMatte - local: model_doc/vit_msn title: ViTMSN + - local: model_doc/vivit + title: ViViT - local: model_doc/yolos title: YOLOS title: Vision models @@ -469,10 +588,22 @@ sections: - local: model_doc/audio-spectrogram-transformer title: Audio Spectrogram Transformer + - local: model_doc/bark + title: Bark + - local: model_doc/clap + title: CLAP + - local: model_doc/encodec + title: EnCodec - local: model_doc/hubert title: Hubert - local: model_doc/mctct title: MCTCT + - local: model_doc/mms + title: MMS + - local: model_doc/musicgen + title: MusicGen + - local: model_doc/pop2piano + title: Pop2Piano - local: model_doc/sew title: SEW - local: model_doc/sew-d @@ -481,10 +612,14 @@ title: Speech2Text - local: model_doc/speech_to_text_2 title: Speech2Text2 + - local: model_doc/speecht5 + title: SpeechT5 - local: model_doc/unispeech title: UniSpeech - local: model_doc/unispeech-sat title: UniSpeech-SAT + - local: model_doc/vits + title: VITS - local: model_doc/wav2vec2 title: Wav2Vec2 - local: model_doc/wav2vec2-conformer @@ -502,10 +637,18 @@ title: Audio models - isExpanded: false sections: + - local: model_doc/align + title: ALIGN - local: model_doc/altclip title: AltCLIP - local: model_doc/blip title: BLIP + - local: model_doc/blip-2 + title: BLIP-2 + - local: model_doc/bridgetower + title: BridgeTower + - local: model_doc/bros + title: BROS - local: model_doc/chinese_clip title: Chinese-CLIP - local: model_doc/clip @@ -514,6 +657,8 @@ title: CLIPSeg - local: model_doc/data2vec title: Data2Vec + - local: model_doc/deplot + title: DePlot - local: model_doc/donut title: Donut - local: model_doc/flava @@ -522,22 +667,46 @@ title: GIT - local: model_doc/groupvit title: GroupViT + - local: model_doc/idefics + title: IDEFICS + - local: model_doc/instructblip + title: InstructBLIP + - local: model_doc/layoutlm + title: LayoutLM - local: model_doc/layoutlmv2 title: LayoutLMV2 - local: model_doc/layoutlmv3 title: LayoutLMV3 - local: model_doc/layoutxlm title: LayoutXLM + - local: model_doc/lilt + title: LiLT - local: model_doc/lxmert title: LXMERT + - local: model_doc/matcha + title: MatCha + - local: model_doc/mgp-str + title: MGP-STR + - local: model_doc/nougat + title: Nougat + - local: model_doc/oneformer + title: OneFormer - local: model_doc/owlvit title: OWL-ViT - local: model_doc/perceiver title: Perceiver + - local: model_doc/pix2struct + title: Pix2Struct + - local: model_doc/sam + title: Segment Anything - local: model_doc/speech-encoder-decoder title: Speech Encoder Decoder Models + - local: model_doc/tapas + title: TAPAS - local: model_doc/trocr title: TrOCR + - local: model_doc/tvlt + title: TVLT - local: model_doc/vilt title: ViLT - local: model_doc/vision-encoder-decoder @@ -558,9 +727,18 @@ title: Reinforcement learning models - isExpanded: false sections: + - local: model_doc/autoformer + title: Autoformer + - local: model_doc/informer + title: Informer - local: model_doc/time_series_transformer 
title: Time Series Transformer title: Time series models + - isExpanded: false + sections: + - local: model_doc/graphormer + title: Graphormer + title: Graph models title: Models - sections: - local: internal/modeling_utils @@ -575,7 +753,11 @@ title: Utilities for Generation - local: internal/image_processing_utils title: Utilities for Image Processors + - local: internal/audio_utils + title: Utilities for Audio processing - local: internal/file_utils title: General Utilities + - local: internal/time_series_utils + title: Utilities for Time Series title: Internal Helpers - title: API \ No newline at end of file + title: API diff --git a/docs/source/en/accelerate.md b/docs/source/en/accelerate.md new file mode 100644 index 000000000000..b0f0e4efe647 --- /dev/null +++ b/docs/source/en/accelerate.md @@ -0,0 +1,136 @@ + + +# Distributed training with 🤗 Accelerate + +As models get bigger, parallelism has emerged as a strategy for training larger models on limited hardware and accelerating training speed by several orders of magnitude. At Hugging Face, we created the [🤗 Accelerate](https://huggingface.co/docs/accelerate) library to help users easily train a 🤗 Transformers model on any type of distributed setup, whether it is multiple GPU's on one machine or multiple GPU's across several machines. In this tutorial, learn how to customize your native PyTorch training loop to enable training in a distributed environment. + +## Setup + +Get started by installing 🤗 Accelerate: + +```bash +pip install accelerate +``` + +Then import and create an [`~accelerate.Accelerator`] object. The [`~accelerate.Accelerator`] will automatically detect your type of distributed setup and initialize all the necessary components for training. You don't need to explicitly place your model on a device. + +```py +>>> from accelerate import Accelerator + +>>> accelerator = Accelerator() +``` + +## Prepare to accelerate + +The next step is to pass all the relevant training objects to the [`~accelerate.Accelerator.prepare`] method. This includes your training and evaluation DataLoaders, a model and an optimizer: + +```py +>>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( +... train_dataloader, eval_dataloader, model, optimizer +... ) +``` + +## Backward + +The last addition is to replace the typical `loss.backward()` in your training loop with 🤗 Accelerate's [`~accelerate.Accelerator.backward`]method: + +```py +>>> for epoch in range(num_epochs): +... for batch in train_dataloader: +... outputs = model(**batch) +... loss = outputs.loss +... accelerator.backward(loss) + +... optimizer.step() +... lr_scheduler.step() +... optimizer.zero_grad() +... progress_bar.update(1) +``` + +As you can see in the following code, you only need to add four additional lines of code to your training loop to enable distributed training! 
+ +```diff ++ from accelerate import Accelerator + from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler + ++ accelerator = Accelerator() + + model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) + optimizer = AdamW(model.parameters(), lr=3e-5) + +- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +- model.to(device) + ++ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( ++ train_dataloader, eval_dataloader, model, optimizer ++ ) + + num_epochs = 3 + num_training_steps = num_epochs * len(train_dataloader) + lr_scheduler = get_scheduler( + "linear", + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=num_training_steps + ) + + progress_bar = tqdm(range(num_training_steps)) + + model.train() + for epoch in range(num_epochs): + for batch in train_dataloader: +- batch = {k: v.to(device) for k, v in batch.items()} + outputs = model(**batch) + loss = outputs.loss +- loss.backward() ++ accelerator.backward(loss) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) +``` + +## Train + +Once you've added the relevant lines of code, launch your training in a script or a notebook like Colaboratory. + +### Train with a script + +If you are running your training from a script, run the following command to create and save a configuration file: + +```bash +accelerate config +``` + +Then launch your training with: + +```bash +accelerate launch train.py +``` + +### Train with a notebook + +🤗 Accelerate can also run in a notebook if you're planning on using Colaboratory's TPUs. Wrap all the code responsible for training in a function, and pass it to [`~accelerate.notebook_launcher`]: + +```py +>>> from accelerate import notebook_launcher + +>>> notebook_launcher(training_function) +``` + +For more information about 🤗 Accelerate and its rich features, refer to the [documentation](https://huggingface.co/docs/accelerate). diff --git a/docs/source/en/accelerate.mdx b/docs/source/en/accelerate.mdx deleted file mode 100644 index 02e05df39074..000000000000 --- a/docs/source/en/accelerate.mdx +++ /dev/null @@ -1,132 +0,0 @@ - - -# Distributed training with 🤗 Accelerate - -As models get bigger, parallelism has emerged as a strategy for training larger models on limited hardware and accelerating training speed by several orders of magnitude. At Hugging Face, we created the [🤗 Accelerate](https://huggingface.co/docs/accelerate) library to help users easily train a 🤗 Transformers model on any type of distributed setup, whether it is multiple GPU's on one machine or multiple GPU's across several machines. In this tutorial, learn how to customize your native PyTorch training loop to enable training in a distributed environment. - -## Setup - -Get started by installing 🤗 Accelerate: - -```bash -pip install accelerate -``` - -Then import and create an [`~accelerate.Accelerator`] object. The [`~accelerate.Accelerator`] will automatically detect your type of distributed setup and initialize all the necessary components for training. You don't need to explicitly place your model on a device. - -```py ->>> from accelerate import Accelerator - ->>> accelerator = Accelerator() -``` - -## Prepare to accelerate - -The next step is to pass all the relevant training objects to the [`~accelerate.Accelerator.prepare`] method. 
This includes your training and evaluation DataLoaders, a model and an optimizer: - -```py ->>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( -... train_dataloader, eval_dataloader, model, optimizer -... ) -``` - -## Backward - -The last addition is to replace the typical `loss.backward()` in your training loop with 🤗 Accelerate's [`~accelerate.Accelerator.backward`]method: - -```py ->>> for epoch in range(num_epochs): -... for batch in train_dataloader: -... outputs = model(**batch) -... loss = outputs.loss -... accelerator.backward(loss) - -... optimizer.step() -... lr_scheduler.step() -... optimizer.zero_grad() -... progress_bar.update(1) -``` - -As you can see in the following code, you only need to add four additional lines of code to your training loop to enable distributed training! - -```diff -+ from accelerate import Accelerator - from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler - -+ accelerator = Accelerator() - - model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) - optimizer = AdamW(model.parameters(), lr=3e-5) - -- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") -- model.to(device) - -+ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( -+ train_dataloader, eval_dataloader, model, optimizer -+ ) - - num_epochs = 3 - num_training_steps = num_epochs * len(train_dataloader) - lr_scheduler = get_scheduler( - "linear", - optimizer=optimizer, - num_warmup_steps=0, - num_training_steps=num_training_steps - ) - - progress_bar = tqdm(range(num_training_steps)) - - model.train() - for epoch in range(num_epochs): - for batch in train_dataloader: -- batch = {k: v.to(device) for k, v in batch.items()} - outputs = model(**batch) - loss = outputs.loss -- loss.backward() -+ accelerator.backward(loss) - - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad() - progress_bar.update(1) -``` - -## Train - -Once you've added the relevant lines of code, launch your training in a script or a notebook like Colaboratory. - -### Train with a script - -If you are running your training from a script, run the following command to create and save a configuration file: - -```bash -accelerate config -``` - -Then launch your training with: - -```bash -accelerate launch train.py -``` - -### Train with a notebook - -🤗 Accelerate can also run in a notebook if you're planning on using Colaboratory's TPUs. Wrap all the code responsible for training in a function, and pass it to [`~accelerate.notebook_launcher`]: - -```py ->>> from accelerate import notebook_launcher - ->>> notebook_launcher(training_function) -``` - -For more information about 🤗 Accelerate and it's rich features, refer to the [documentation](https://huggingface.co/docs/accelerate). \ No newline at end of file diff --git a/docs/source/en/add_new_model.md b/docs/source/en/add_new_model.md new file mode 100644 index 000000000000..6766c8ecf048 --- /dev/null +++ b/docs/source/en/add_new_model.md @@ -0,0 +1,895 @@ + + +# How to add a model to 🤗 Transformers? + +The 🤗 Transformers library is often able to offer new models thanks to community contributors. But this can be a challenging project and requires an in-depth knowledge of the 🤗 Transformers library and the model to implement. 
At Hugging Face, we're trying to empower more of the community to actively add models and we've put together this guide to walk you through the process of adding a PyTorch model (make sure you have [PyTorch installed](https://pytorch.org/get-started/locally/)). + + + +If you're interested in implementing a TensorFlow model, take a look at the [How to convert a 🤗 Transformers model to TensorFlow](add_tensorflow_model) guide! + + + +Along the way, you'll: + +- get insights into open-source best practices +- understand the design principles behind one of the most popular deep learning libraries +- learn how to efficiently test large models +- learn how to integrate Python utilities like `black`, `ruff`, and `make fix-copies` to ensure clean and readable code + +A Hugging Face team member will be available to help you along the way so you'll never be alone. 🤗 ❤️ + +To get started, open a [New model addition](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&template=new-model-addition.yml) issue for the model you want to see in 🤗 Transformers. If you're not especially picky about contributing a specific model, you can filter by the [New model label](https://github.com/huggingface/transformers/labels/New%20model) to see if there are any unclaimed model requests and work on it. + +Once you've opened a new model request, the first step is to get familiar with 🤗 Transformers if you aren't already! + +## General overview of 🤗 Transformers + +First, you should get a general overview of 🤗 Transformers. 🤗 Transformers is a very opinionated library, so there is a +chance that you don't agree with some of the library's philosophies or design choices. From our experience, however, we +found that the fundamental design choices and philosophies of the library are crucial to efficiently scale 🤗 +Transformers while keeping maintenance costs at a reasonable level. + +A good first starting point to better understand the library is to read the [documentation of our philosophy](philosophy). As a result of our way of working, there are some choices that we try to apply to all models: + +- Composition is generally favored over-abstraction +- Duplicating code is not always bad if it strongly improves the readability or accessibility of a model +- Model files are as self-contained as possible so that when you read the code of a specific model, you ideally only + have to look into the respective `modeling_....py` file. + +In our opinion, the library's code is not just a means to provide a product, *e.g.* the ability to use BERT for +inference, but also as the very product that we want to improve. Hence, when adding a model, the user is not only the +person who will use your model, but also everybody who will read, try to understand, and possibly tweak your code. + +With this in mind, let's go a bit deeper into the general library design. + +### Overview of models + +To successfully add a model, it is important to understand the interaction between your model and its config, +[`PreTrainedModel`], and [`PretrainedConfig`]. For exemplary purposes, we will +call the model to be added to 🤗 Transformers `BrandNewBert`. + +Let's take a look: + + + +As you can see, we do make use of inheritance in 🤗 Transformers, but we keep the level of abstraction to an absolute +minimum. There are never more than two levels of abstraction for any model in the library. `BrandNewBertModel` +inherits from `BrandNewBertPreTrainedModel` which in turn inherits from [`PreTrainedModel`] and +that's it. 
As a general rule, we want to make sure that a new model only depends on +[`PreTrainedModel`]. The important functionalities that are automatically provided to every new +model are [`~PreTrainedModel.from_pretrained`] and +[`~PreTrainedModel.save_pretrained`], which are used for serialization and deserialization. All of the +other important functionalities, such as `BrandNewBertModel.forward` should be completely defined in the new +`modeling_brand_new_bert.py` script. Next, we want to make sure that a model with a specific head layer, such as +`BrandNewBertForMaskedLM` does not inherit from `BrandNewBertModel`, but rather uses `BrandNewBertModel` +as a component that can be called in its forward pass to keep the level of abstraction low. Every new model requires a +configuration class, called `BrandNewBertConfig`. This configuration is always stored as an attribute in +[`PreTrainedModel`], and thus can be accessed via the `config` attribute for all classes +inheriting from `BrandNewBertPreTrainedModel`: + +```python +model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert") +model.config # model has access to its config +``` + +Similar to the model, the configuration inherits basic serialization and deserialization functionalities from +[`PretrainedConfig`]. Note that the configuration and the model are always serialized into two +different formats - the model to a *pytorch_model.bin* file and the configuration to a *config.json* file. Calling +[`~PreTrainedModel.save_pretrained`] will automatically call +[`~PretrainedConfig.save_pretrained`], so that both model and configuration are saved. + + +### Code style + +When coding your new model, keep in mind that Transformers is an opinionated library and we have a few quirks of our +own regarding how code should be written :-) + +1. The forward pass of your model should be fully written in the modeling file while being fully independent of other + models in the library. If you want to reuse a block from another model, copy the code and paste it with a + `# Copied from` comment on top (see [here](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160) + for a good example and [there](pr_checks#check-copies) for more documentation on Copied from). +2. The code should be fully understandable, even by a non-native English speaker. This means you should pick + descriptive variable names and avoid abbreviations. As an example, `activation` is preferred to `act`. + One-letter variable names are strongly discouraged unless it's an index in a for loop. +3. More generally we prefer longer explicit code to short magical one. +4. Avoid subclassing `nn.Sequential` in PyTorch but subclass `nn.Module` and write the forward pass, so that anyone + using your code can quickly debug it by adding print statements or breaking points. +5. Your function signature should be type-annotated. For the rest, good variable names are way more readable and + understandable than type annotations. + +### Overview of tokenizers + +Not quite ready yet :-( This section will be added soon! + +## Step-by-step recipe to add a model to 🤗 Transformers + +Everyone has different preferences of how to port a model so it can be very helpful for you to take a look at summaries +of how other contributors ported models to Hugging Face. Here is a list of community blog posts on how to port a model: + +1. 
[Porting GPT2 Model](https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28) by [Thomas](https://huggingface.co/thomwolf) +2. [Porting WMT19 MT Model](https://huggingface.co/blog/porting-fsmt) by [Stas](https://huggingface.co/stas) + +From experience, we can tell you that the most important things to keep in mind when adding a model are: + +- Don't reinvent the wheel! Most parts of the code you will add for the new 🤗 Transformers model already exist + somewhere in 🤗 Transformers. Take some time to find similar, already existing models and tokenizers you can copy + from. [grep](https://www.gnu.org/software/grep/) and [rg](https://github.com/BurntSushi/ripgrep) are your + friends. Note that it might very well happen that your model's tokenizer is based on one model implementation, and + your model's modeling code on another one. *E.g.* FSMT's modeling code is based on BART, while FSMT's tokenizer code + is based on XLM. +- It's more of an engineering challenge than a scientific challenge. You should spend more time creating an + efficient debugging environment rather than trying to understand all theoretical aspects of the model in the paper. +- Ask for help, when you're stuck! Models are the core component of 🤗 Transformers so we at Hugging Face are more + than happy to help you at every step to add your model. Don't hesitate to ask if you notice you are not making + progress. + +In the following, we try to give you a general recipe that we found most useful when porting a model to 🤗 Transformers. + +The following list is a summary of everything that has to be done to add a model and can be used by you as a To-Do +List: + +☐ (Optional) Understood the model's theoretical aspects
+☐ Prepared 🤗 Transformers dev environment
+☐ Set up debugging environment of the original repository
+☐ Created script that successfully runs the `forward()` pass using the original repository and checkpoint
+☐ Successfully added the model skeleton to 🤗 Transformers
+☐ Successfully converted original checkpoint to 🤗 Transformers checkpoint
+☐ Successfully ran `forward()` pass in 🤗 Transformers that gives identical output to original checkpoint
+☐ Finished model tests in 🤗 Transformers
+☐ Successfully added tokenizer in 🤗 Transformers
+☐ Run end-to-end integration tests
+☐ Finished docs
+☐ Uploaded model weights to the Hub
+☐ Submitted the pull request
+☐ (Optional) Added a demo notebook + +To begin with, we usually recommend starting by getting a good theoretical understanding of `BrandNewBert`. However, +if you prefer to understand the theoretical aspects of the model *on-the-job*, then it is totally fine to directly dive +into the `BrandNewBert`'s code-base. This option might suit you better if your engineering skills are better than +your theoretical skill, if you have trouble understanding `BrandNewBert`'s paper, or if you just enjoy programming +much more than reading scientific papers. + +### 1. (Optional) Theoretical aspects of BrandNewBert + +You should take some time to read *BrandNewBert's* paper, if such descriptive work exists. There might be large +sections of the paper that are difficult to understand. If this is the case, this is fine - don't worry! The goal is +not to get a deep theoretical understanding of the paper, but to extract the necessary information required to +effectively re-implement the model in 🤗 Transformers. That being said, you don't have to spend too much time on the +theoretical aspects, but rather focus on the practical ones, namely: + +- What type of model is *brand_new_bert*? BERT-like encoder-only model? GPT2-like decoder-only model? BART-like + encoder-decoder model? Look at the [model_summary](model_summary) if you're not familiar with the differences between those. +- What are the applications of *brand_new_bert*? Text classification? Text generation? Seq2Seq tasks, *e.g.,* + summarization? +- What is the novel feature of the model that makes it different from BERT/GPT-2/BART? +- Which of the already existing [🤗 Transformers models](https://huggingface.co/transformers/#contents) is most + similar to *brand_new_bert*? +- What type of tokenizer is used? A sentencepiece tokenizer? Word piece tokenizer? Is it the same tokenizer as used + for BERT or BART? + +After you feel like you have gotten a good overview of the architecture of the model, you might want to write to the +Hugging Face team with any questions you might have. This might include questions regarding the model's architecture, +its attention layer, etc. We will be more than happy to help you. + +### 2. Next prepare your environment + +1. Fork the [repository](https://github.com/huggingface/transformers) by clicking on the ‘Fork' button on the + repository's page. This creates a copy of the code under your GitHub user account. + +2. Clone your `transformers` fork to your local disk, and add the base repository as a remote: + +```bash +git clone https://github.com/[your Github handle]/transformers.git +cd transformers +git remote add upstream https://github.com/huggingface/transformers.git +``` + +3. Set up a development environment, for instance by running the following command: + +```bash +python -m venv .env +source .env/bin/activate +pip install -e ".[dev]" +``` + +Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a +failure with this command. If that's the case make sure to install the Deep Learning framework you are working with +(PyTorch, TensorFlow and/or Flax) then do: + +```bash +pip install -e ".[quality]" +``` + +which should be enough for most use cases. You can then return to the parent directory + +```bash +cd .. +``` + +4. We recommend adding the PyTorch version of *brand_new_bert* to Transformers. To install PyTorch, please follow the + instructions on https://pytorch.org/get-started/locally/. + +**Note:** You don't need to have CUDA installed. 
Making the new model work on CPU is sufficient. + +5. To port *brand_new_bert*, you will also need access to its original repository: + +```bash +git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git +cd brand_new_bert +pip install -e . +``` + +Now you have set up a development environment to port *brand_new_bert* to 🤗 Transformers. + +### 3.-4. Run a pretrained checkpoint using the original repository + +At first, you will work on the original *brand_new_bert* repository. Often, the original implementation is very +“researchy”. Meaning that documentation might be lacking and the code can be difficult to understand. But this should +be exactly your motivation to reimplement *brand_new_bert*. At Hugging Face, one of our main goals is to *make people +stand on the shoulders of giants* which translates here very well into taking a working model and rewriting it to make +it as **accessible, user-friendly, and beautiful** as possible. This is the number-one motivation to re-implement +models into 🤗 Transformers - trying to make complex new NLP technology accessible to **everybody**. + +You should start thereby by diving into the original repository. + +Successfully running the official pretrained model in the original repository is often **the most difficult** step. +From our experience, it is very important to spend some time getting familiar with the original code-base. You need to +figure out the following: + +- Where to find the pretrained weights? +- How to load the pretrained weights into the corresponding model? +- How to run the tokenizer independently from the model? +- Trace one forward pass so that you know which classes and functions are required for a simple forward pass. Usually, + you only have to reimplement those functions. +- Be able to locate the important components of the model: Where is the model's class? Are there model sub-classes, + *e.g.* EncoderModel, DecoderModel? Where is the self-attention layer? Are there multiple different attention layers, + *e.g.* *self-attention*, *cross-attention*...? +- How can you debug the model in the original environment of the repo? Do you have to add *print* statements, can you + work with an interactive debugger like *ipdb*, or should you use an efficient IDE to debug the model, like PyCharm? + +It is very important that before you start the porting process, you can **efficiently** debug code in the original +repository! Also, remember that you are working with an open-source library, so do not hesitate to open an issue, or +even a pull request in the original repository. The maintainers of this repository are most likely very happy about +someone looking into their code! + +At this point, it is really up to you which debugging environment and strategy you prefer to use to debug the original +model. We strongly advise against setting up a costly GPU environment, but simply work on a CPU both when starting to +dive into the original repository and also when starting to write the 🤗 Transformers implementation of the model. Only +at the very end, when the model has already been successfully ported to 🤗 Transformers, one should verify that the +model also works as expected on GPU. + +In general, there are two possible debugging environments for running the original model + +- [Jupyter notebooks](https://jupyter.org/) / [google colab](https://colab.research.google.com/notebooks/intro.ipynb) +- Local python scripts. 
+ +Jupyter notebooks have the advantage that they allow for cell-by-cell execution which can be helpful to better split +logical components from one another and to have faster debugging cycles as intermediate results can be stored. Also, +notebooks are often easier to share with other contributors, which might be very helpful if you want to ask the Hugging +Face team for help. If you are familiar with Jupyter notebooks, we strongly recommend you work with them. + +The obvious disadvantage of Jupyter notebooks is that if you are not used to working with them you will have to spend +some time adjusting to the new programming environment and you might not be able to use your known debugging tools +anymore, like `ipdb`. + +For each code-base, a good first step is always to load a **small** pretrained checkpoint and to be able to reproduce a +single forward pass using a dummy integer vector of input IDs as an input. Such a script could look like this (in +pseudocode): + +```python +model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") +input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids +original_output = model.predict(input_ids) +``` + +Next, regarding the debugging strategy, there are generally a few from which to choose from: + +- Decompose the original model into many small testable components and run a forward pass on each of those for + verification +- Decompose the original model only into the original *tokenizer* and the original *model*, run a forward pass on + those, and use intermediate print statements or breakpoints for verification + +Again, it is up to you which strategy to choose. Often, one or the other is advantageous depending on the original code +base. + +If the original code-base allows you to decompose the model into smaller sub-components, *e.g.* if the original +code-base can easily be run in eager mode, it is usually worth the effort to do so. There are some important advantages +to taking the more difficult road in the beginning: + +- at a later stage when comparing the original model to the Hugging Face implementation, you can verify automatically + for each component individually that the corresponding component of the 🤗 Transformers implementation matches instead + of relying on visual comparison via print statements +- it can give you some rope to decompose the big problem of porting a model into smaller problems of just porting + individual components and thus structure your work better +- separating the model into logical meaningful components will help you to get a better overview of the model's design + and thus to better understand the model +- at a later stage those component-by-component tests help you to ensure that no regression occurs as you continue + changing your code + +[Lysandre's](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed) integration checks for ELECTRA +gives a nice example of how this can be done. + +However, if the original code-base is very complex or only allows intermediate components to be run in a compiled mode, +it might be too time-consuming or even impossible to separate the model into smaller testable sub-components. A good +example is [T5's MeshTensorFlow](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow) library which is +very complex and does not offer a simple way to decompose the model into its sub-components. For such libraries, one +often relies on verifying print statements. 
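+If the original code base is written in PyTorch, forward hooks can be a lightweight middle ground between fully
+decomposing the model and sprinkling print statements everywhere. The snippet below is only a sketch: it assumes the
+original model is already loaded as `model`, and the module path `model.encoder.layers[0]` is hypothetical and has to
+be adapted to the structure of the model you are porting.
+
+```python
+import torch
+
+intermediate_outputs = {}
+
+
+def save_output(name):
+    # Returns a hook that stores the output of the hooked module under `name`.
+    def hook(module, inputs, output):
+        intermediate_outputs[name] = output
+
+    return hook
+
+
+# Hypothetical module path - adapt it to the original model's structure.
+model.encoder.layers[0].register_forward_hook(save_output("first_layer"))
+
+with torch.no_grad():
+    model(torch.tensor([[0, 4, 5, 2, 3, 7, 9]]))
+
+print(intermediate_outputs["first_layer"])
+```
+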
+ +No matter which strategy you choose, the recommended procedure is often the same that you should start to debug the +starting layers first and the ending layers last. + +It is recommended that you retrieve the output, either by print statements or sub-component functions, of the following +layers in the following order: + +1. Retrieve the input IDs passed to the model +2. Retrieve the word embeddings +3. Retrieve the input of the first Transformer layer +4. Retrieve the output of the first Transformer layer +5. Retrieve the output of the following n - 1 Transformer layers +6. Retrieve the output of the whole BrandNewBert Model + +Input IDs should thereby consists of an array of integers, *e.g.* `input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]` + +The outputs of the following layers often consist of multi-dimensional float arrays and can look like this: + +``` +[[ + [-0.1465, -0.6501, 0.1993, ..., 0.1451, 0.3430, 0.6024], + [-0.4417, -0.5920, 0.3450, ..., -0.3062, 0.6182, 0.7132], + [-0.5009, -0.7122, 0.4548, ..., -0.3662, 0.6091, 0.7648], + ..., + [-0.5613, -0.6332, 0.4324, ..., -0.3792, 0.7372, 0.9288], + [-0.5416, -0.6345, 0.4180, ..., -0.3564, 0.6992, 0.9191], + [-0.5334, -0.6403, 0.4271, ..., -0.3339, 0.6533, 0.8694]]], +``` + +We expect that every model added to 🤗 Transformers passes a couple of integration tests, meaning that the original +model and the reimplemented version in 🤗 Transformers have to give the exact same output up to a precision of 0.001! +Since it is normal that the exact same model written in different libraries can give a slightly different output +depending on the library framework, we accept an error tolerance of 1e-3 (0.001). It is not enough if the model gives +nearly the same output, they have to be almost identical. Therefore, you will certainly compare the intermediate +outputs of the 🤗 Transformers version multiple times against the intermediate outputs of the original implementation of +*brand_new_bert* in which case an **efficient** debugging environment of the original repository is absolutely +important. Here is some advice to make your debugging environment as efficient as possible. + +- Find the best way of debugging intermediate results. Is the original repository written in PyTorch? Then you should + probably take the time to write a longer script that decomposes the original model into smaller sub-components to + retrieve intermediate values. Is the original repository written in Tensorflow 1? Then you might have to rely on + TensorFlow print operations like [tf.print](https://www.tensorflow.org/api_docs/python/tf/print) to output + intermediate values. Is the original repository written in Jax? Then make sure that the model is **not jitted** when + running the forward pass, *e.g.* check-out [this link](https://github.com/google/jax/issues/196). +- Use the smallest pretrained checkpoint you can find. The smaller the checkpoint, the faster your debug cycle + becomes. It is not efficient if your pretrained model is so big that your forward pass takes more than 10 seconds. + In case only very large checkpoints are available, it might make more sense to create a dummy model in the new + environment with randomly initialized weights and save those weights for comparison with the 🤗 Transformers version + of your model +- Make sure you are using the easiest way of calling a forward pass in the original repository. 
Ideally, you want to + find the function in the original repository that **only** calls a single forward pass, *i.e.* that is often called + `predict`, `evaluate`, `forward` or `__call__`. You don't want to debug a function that calls `forward` + multiple times, *e.g.* to generate text, like `autoregressive_sample`, `generate`. +- Try to separate the tokenization from the model's *forward* pass. If the original repository shows examples where + you have to input a string, then try to find out where in the forward call the string input is changed to input ids + and start from this point. This might mean that you have to possibly write a small script yourself or change the + original code so that you can directly input the ids instead of an input string. +- Make sure that the model in your debugging setup is **not** in training mode, which often causes the model to yield + random outputs due to multiple dropout layers in the model. Make sure that the forward pass in your debugging + environment is **deterministic** so that the dropout layers are not used. Or use *transformers.utils.set_seed* + if the old and new implementations are in the same framework. + +The following section gives you more specific details/tips on how you can do this for *brand_new_bert*. + +### 5.-14. Port BrandNewBert to 🤗 Transformers + +Next, you can finally start adding new code to 🤗 Transformers. Go into the clone of your 🤗 Transformers' fork: + +```bash +cd transformers +``` + +In the special case that you are adding a model whose architecture exactly matches the model architecture of an +existing model you only have to add a conversion script as described in [this section](#write-a-conversion-script). +In this case, you can just re-use the whole model architecture of the already existing model. + +Otherwise, let's start generating a new model. You have two choices here: + +- `transformers-cli add-new-model-like` to add a new model like an existing one +- `transformers-cli add-new-model` to add a new model from our template (will look like BERT or Bart depending on the type of model you select) + +In both cases, you will be prompted with a questionnaire to fill in the basic information of your model. The second command requires to install `cookiecutter`, you can find more information on it [here](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model). + +**Open a Pull Request on the main huggingface/transformers repo** + +Before starting to adapt the automatically generated code, now is the time to open a “Work in progress (WIP)” pull +request, *e.g.* “[WIP] Add *brand_new_bert*”, in 🤗 Transformers so that you and the Hugging Face team can work +side-by-side on integrating the model into 🤗 Transformers. + +You should do the following: + +1. Create a branch with a descriptive name from your main branch + +```bash +git checkout -b add_brand_new_bert +``` + +2. Commit the automatically generated code: + +```bash +git add . +git commit +``` + +3. Fetch and rebase to current main + +```bash +git fetch upstream +git rebase upstream/main +``` + +4. Push the changes to your account using: + +```bash +git push -u origin a-descriptive-name-for-my-changes +``` + +5. Once you are satisfied, go to the webpage of your fork on GitHub. Click on “Pull request”. Make sure to add the + GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for + future changes. + +6. 
Change the PR into a draft by clicking on “Convert to draft” on the right of the GitHub pull request web page.
+
+In the following, whenever you have made some progress, don't forget to commit your work and push it to your account so
+that it shows in the pull request. Additionally, you should make sure to update your work with the current main from
+time to time by doing:
+
+```bash
+git fetch upstream
+git merge upstream/main
+```
+
+In general, all questions you might have regarding the model or your implementation should be asked in your PR and
+discussed/solved in the PR. This way, the Hugging Face team will always be notified when you are committing new code or
+if you have a question. It is often very helpful to point the Hugging Face team to your added code so that the Hugging
+Face team can efficiently understand your problem or question.
+
+To do so, you can go to the “Files changed” tab where you see all of your changes, go to a line regarding which you
+want to ask a question, and click on the “+” symbol to add a comment. Whenever a question or problem has been solved,
+you can click on the “Resolve” button of the created comment.
+
+In the same way, the Hugging Face team will open comments when reviewing your code. We recommend asking most questions
+on GitHub on your PR. For some very general questions that are not very useful for the public, feel free to ping the
+Hugging Face team by Slack or email.
+
+**5. Adapt the generated model's code for brand_new_bert**
+
+At first, we will focus only on the model itself and not care about the tokenizer. All the relevant code should be
+found in the generated files `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` and
+`src/transformers/models/brand_new_bert/configuration_brand_new_bert.py`.
+
+Now you can finally start coding :). The generated code in
+`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` will either have the same architecture as BERT if
+it's an encoder-only model or BART if it's an encoder-decoder model. At this point, you should remind yourself what
+you've learned in the beginning about the theoretical aspects of the model: *How is the model different from BERT or
+BART?* Implement those changes, which often means changing the *self-attention* layer, the order of the normalization
+layer, etc… Again, it is often useful to look at the similar architecture of already existing models in Transformers to
+get a better feeling of how your model should be implemented.
+
+**Note** that at this point, you don't have to be very sure that your code is fully correct or clean. Rather, it is
+advised to add a first *unclean*, copy-pasted version of the original code to
+`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` until you feel like all the necessary code is
+added. From our experience, it is much more efficient to quickly add a first version of the required code and
+improve/correct the code iteratively with the conversion script as described in the next section. The only thing that
+has to work at this point is that you can instantiate the 🤗 Transformers implementation of *brand_new_bert*, *i.e.* the
+following command should work:
+
+```python
+from transformers import BrandNewBertModel, BrandNewBertConfig
+
+model = BrandNewBertModel(BrandNewBertConfig())
+```
+
+The above command will create a model according to the default parameters as defined in `BrandNewBertConfig()` with
+random weights, thus making sure that the `init()` methods of all components work.
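+
+If you want a quicker check than the full default-sized model, a small sketch like the following can also help; the
+exact argument names (`hidden_size`, `num_hidden_layers`, `num_attention_heads`) are assumptions and depend on what you
+actually define in `BrandNewBertConfig`:
+
+```python
+from transformers import BrandNewBertModel, BrandNewBertConfig
+
+# Hypothetical tiny configuration for a fast instantiation check; adapt the
+# argument names to whatever your configuration class exposes.
+tiny_config = BrandNewBertConfig(hidden_size=128, num_hidden_layers=2, num_attention_heads=4)
+model = BrandNewBertModel(tiny_config)
+
+# Printing the module tree and the parameter count is a cheap way to verify
+# that all sub-components were built as expected.
+print(model)
+print(f"Number of parameters: {sum(p.numel() for p in model.parameters())}")
+```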
+
+Note that all random initialization should happen in the `_init_weights` method of your `BrandNewBertPreTrainedModel`
+class. It should initialize all leaf modules depending on the variables of the config. Here is an example with the
+BERT `_init_weights` method:
+
+```py
+def _init_weights(self, module):
+    """Initialize the weights"""
+    if isinstance(module, nn.Linear):
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        if module.bias is not None:
+            module.bias.data.zero_()
+    elif isinstance(module, nn.Embedding):
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        if module.padding_idx is not None:
+            module.weight.data[module.padding_idx].zero_()
+    elif isinstance(module, nn.LayerNorm):
+        module.bias.data.zero_()
+        module.weight.data.fill_(1.0)
+```
+
+You can have some more custom schemes if you need a special initialization for some modules. For instance, in
+`Wav2Vec2ForPreTraining`, the last two linear layers need to have the initialization of the regular PyTorch `nn.Linear`
+but all the other ones should use an initialization as above. This is coded like this:
+
+```py
+def _init_weights(self, module):
+    """Initialize the weights"""
+    if isinstance(module, Wav2Vec2ForPreTraining):
+        module.project_hid.reset_parameters()
+        module.project_q.reset_parameters()
+        module.project_hid._is_hf_initialized = True
+        module.project_q._is_hf_initialized = True
+    elif isinstance(module, nn.Linear):
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        if module.bias is not None:
+            module.bias.data.zero_()
+```
+
+The `_is_hf_initialized` flag is internally used to make sure we only initialize a submodule once. By setting it to
+`True` for `module.project_q` and `module.project_hid`, we make sure the custom initialization we did is not overridden
+later on, as the `_init_weights` function won't be applied to them.
+
+**6. Write a conversion script**
+
+Next, you should write a conversion script that lets you convert the checkpoint you used to debug *brand_new_bert* in
+the original repository to a checkpoint compatible with your newly created 🤗 Transformers implementation of
+*brand_new_bert*. It is not advised to write the conversion script from scratch, but rather to look through already
+existing conversion scripts in 🤗 Transformers for one that has been used to convert a similar model that was written in
+the same framework as *brand_new_bert*. Usually, it is enough to copy an already existing conversion script and
+slightly adapt it for your use case. Don't hesitate to ask the Hugging Face team to point you to a similar already
+existing conversion script for your model.
+
+- If you are porting a model from TensorFlow to PyTorch, a good starting point might be BERT's conversion script [here](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91)
+- If you are porting a model from PyTorch to PyTorch, a good starting point might be BART's conversion script [here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py)
+
+In the following, we'll quickly explain how PyTorch models store layer weights and define layer names. In PyTorch, the
+name of a layer is defined by the name of the class attribute you give the layer.
Let's define a dummy model in
+PyTorch, called `SimpleModel` as follows:
+
+```python
+from torch import nn
+
+
+class SimpleModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.dense = nn.Linear(10, 10)
+        self.intermediate = nn.Linear(10, 10)
+        self.layer_norm = nn.LayerNorm(10)
+```
+
+Now we can create an instance of this model definition, which will fill the `dense`, `intermediate`, and
+`layer_norm` layers with random weights. We can print the model to see its architecture:
+
+```python
+model = SimpleModel()
+
+print(model)
+```
+
+This will print out the following:
+
+```
+SimpleModel(
+  (dense): Linear(in_features=10, out_features=10, bias=True)
+  (intermediate): Linear(in_features=10, out_features=10, bias=True)
+  (layer_norm): LayerNorm((10,), eps=1e-05, elementwise_affine=True)
+)
+```
+
+We can see that the layer names are defined by the name of the class attribute in PyTorch. You can print out the weight
+values of a specific layer:
+
+```python
+print(model.dense.weight.data)
+```
+
+to see that the weights were randomly initialized:
+
+```
+tensor([[-0.0818,  0.2207, -0.0749, -0.0030,  0.0045, -0.1569, -0.1598,  0.0212,
+         -0.2077,  0.2157],
+        [ 0.1044,  0.0201,  0.0990,  0.2482,  0.3116,  0.2509,  0.2866, -0.2190,
+          0.2166, -0.0212],
+        [-0.2000,  0.1107, -0.1999, -0.3119,  0.1559,  0.0993,  0.1776, -0.1950,
+         -0.1023, -0.0447],
+        [-0.0888, -0.1092,  0.2281,  0.0336,  0.1817, -0.0115,  0.2096,  0.1415,
+         -0.1876, -0.2467],
+        [ 0.2208, -0.2352, -0.1426, -0.2636, -0.2889, -0.2061, -0.2849, -0.0465,
+          0.2577,  0.0402],
+        [ 0.1502,  0.2465,  0.2566,  0.0693,  0.2352, -0.0530,  0.1859, -0.0604,
+          0.2132,  0.1680],
+        [ 0.1733, -0.2407, -0.1721,  0.1484,  0.0358, -0.0633, -0.0721, -0.0090,
+          0.2707, -0.2509],
+        [-0.1173,  0.1561,  0.2945,  0.0595, -0.1996,  0.2988, -0.0802,  0.0407,
+          0.1829, -0.1568],
+        [-0.1164, -0.2228, -0.0403,  0.0428,  0.1339,  0.0047,  0.1967,  0.2923,
+          0.0333, -0.0536],
+        [-0.1492, -0.1616,  0.1057,  0.1950, -0.2807, -0.2710, -0.1586,  0.0739,
+          0.2220,  0.2358]])
+```
+
+In the conversion script, you should fill those randomly initialized weights with the exact weights of the
+corresponding layer in the checkpoint. *E.g.*
+
+```python
+# retrieve matching layer weights, e.g. by
+# recursive algorithm
+layer_name = "dense"
+pretrained_weight = array_of_dense_layer
+
+model_pointer = getattr(model, "dense")
+
+model_pointer.weight.data = torch.from_numpy(pretrained_weight)
+```
+
+While doing so, you must verify that each randomly initialized weight of your PyTorch model and its corresponding
+pretrained checkpoint weight exactly match in both **shape and name**. To do so, it is **necessary** to add assert
+statements for the shape and print out the names of the checkpoint weights. E.g. you should add statements like:
+
+```python
+assert (
+    model_pointer.weight.shape == pretrained_weight.shape
+), f"Pointer shape of random weight {model_pointer.weight.shape} and array shape of checkpoint weight {pretrained_weight.shape} mismatched"
+```
+
+Besides, you should also print out the names of both weights to make sure they match, *e.g.*
+
+```python
+logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}")
+```
+
+If either the shape or the name doesn't match, you probably assigned the wrong checkpoint weight to a randomly
+initialized layer of the 🤗 Transformers implementation.
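+
+Putting these checks together, a conversion loop often ends up looking roughly like the sketch below. The flat NumPy
+checkpoint dictionary and the `ORIGINAL_TO_HF` name mapping are illustrative assumptions; you will have to adapt both
+to how the original repository actually stores its weights:
+
+```python
+import torch
+
+# Hypothetical mapping from original checkpoint keys to 🤗 Transformers parameter
+# names; building this mapping is the core work of the conversion script.
+ORIGINAL_TO_HF = {
+    "encoder/dense/kernel": "dense.weight",
+    "encoder/dense/bias": "dense.bias",
+}
+
+
+def load_original_weights(model, original_state_dict):
+    hf_state_dict = model.state_dict()
+    for original_name, array in original_state_dict.items():
+        hf_name = ORIGINAL_TO_HF[original_name]
+        tensor = torch.from_numpy(array)
+        # Verify name and shape before assigning, as described above.
+        assert hf_name in hf_state_dict, f"{hf_name} is missing in the 🤗 Transformers model"
+        assert (
+            hf_state_dict[hf_name].shape == tensor.shape
+        ), f"Shape mismatch for {hf_name}: {hf_state_dict[hf_name].shape} vs {tensor.shape}"
+        print(f"Initialize PyTorch weight {hf_name} from {original_name}")
+        hf_state_dict[hf_name] = tensor
+    model.load_state_dict(hf_state_dict)
+    return model
+```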
+
+An incorrect shape is most likely due to an incorrect setting of the config parameters in `BrandNewBertConfig()` that
+do not exactly match those that were used for the checkpoint you want to convert. However, it could also be that
+PyTorch's implementation of a layer requires the weight to be transposed beforehand.
+
+Finally, you should also check that **all** required weights are initialized and print out all checkpoint weights that
+were not used for initialization to make sure the model is correctly converted. It is completely normal that the
+conversion trials fail with either a wrong shape statement or a wrong name assignment. This is most likely because you
+either used incorrect parameters in `BrandNewBertConfig()`, have a wrong architecture in the 🤗 Transformers
+implementation, have a bug in the `init()` functions of one of the components of the 🤗 Transformers
+implementation, or need to transpose one of the checkpoint weights.
+
+This step should be iterated with the previous step until all weights of the checkpoint are correctly loaded in the
+Transformers model. Having correctly loaded the checkpoint into the 🤗 Transformers implementation, you can then save
+the model under a folder of your choice `/path/to/converted/checkpoint/folder` that should then contain both a
+`pytorch_model.bin` file and a `config.json` file:
+
+```python
+model.save_pretrained("/path/to/converted/checkpoint/folder")
+```
+
+**7. Implement the forward pass**
+
+Having managed to correctly load the pretrained weights into the 🤗 Transformers implementation, you should now make
+sure that the forward pass is correctly implemented. In [Get familiar with the original repository](#34-run-a-pretrained-checkpoint-using-the-original-repository), you have already created a script that runs a forward
+pass of the model using the original repository. Now you should write an analogous script using the 🤗 Transformers
+implementation instead of the original one. It should look as follows:
+
+```python
+import torch
+
+model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder")
+input_ids = torch.tensor([[0, 4, 4, 3, 2, 4, 1, 7, 19]])  # a dummy batch of size 1
+output = model(input_ids).last_hidden_state
+```
+
+It is very likely that the 🤗 Transformers implementation and the original model implementation don't give the exact
+same output the very first time or that the forward pass throws an error. Don't be disappointed - it's expected! First,
+you should make sure that the forward pass doesn't throw any errors. It often happens that the wrong dimensions are
+used, leading to a *Dimensionality mismatch* error, or that the wrong data type object is used, *e.g.* `torch.long`
+instead of `torch.float32`. Don't hesitate to ask the Hugging Face team for help if you don't manage to solve
+certain errors.
+
+The final part to make sure the 🤗 Transformers implementation works correctly is to ensure that the outputs are
+equivalent to a precision of `1e-3`. First, you should ensure that the output shapes are identical, *i.e.*
+`outputs.shape` should yield the same value for the script of the 🤗 Transformers implementation and the original
+implementation. Next, you should make sure that the output values are identical as well. This is one of the most
+difficult parts of adding a new model.
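+
+A minimal comparison sketch could look like this; it assumes `original_output` holds the final hidden states produced
+by the script you wrote for the original repository (converted to a `torch.Tensor`) and `output` holds the 🤗
+Transformers hidden states from the snippet above:
+
+```python
+import torch
+
+assert output.shape == original_output.shape, f"Shape mismatch: {output.shape} vs {original_output.shape}"
+
+# Printing the maximum absolute difference is often more informative than the
+# bare True/False returned by torch.allclose when hunting down a divergence.
+max_diff = (output - original_output).abs().max().item()
+print(f"Maximum absolute difference: {max_diff}")
+assert torch.allclose(output, original_output, atol=1e-3), "Outputs differ by more than 1e-3"
+```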
+Common reasons why the outputs are not identical are:
+
+- Some layers were not added, *e.g.* an *activation* layer was not added, or the residual connection was forgotten
+- The word embedding matrix was not tied
+- The wrong positional embeddings are used because the original implementation uses an offset
+- Dropout is applied during the forward pass. To fix this, make sure *model.training* is *False* and that no dropout
+  layer is falsely activated during the forward pass, *i.e.* pass *self.training* to [PyTorch's functional dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout)
+
+The best way to fix the problem is usually to look at the forward pass of the original implementation and the 🤗
+Transformers implementation side-by-side and check if there are any differences. Ideally, you should debug/print out
+intermediate outputs of both implementations of the forward pass to find the exact position in the network where the 🤗
+Transformers implementation shows a different output than the original implementation. First, make sure that the
+hard-coded `input_ids` in both scripts are identical. Next, verify that the outputs of the first transformation of
+the `input_ids` (usually the word embeddings) are identical. And then work your way up to the very last layer of the
+network. At some point, you will notice a difference between the two implementations, which should point you to the bug
+in the 🤗 Transformers implementation. From our experience, a simple and efficient way is to add many print statements
+in both the original implementation and 🤗 Transformers implementation, at the same positions in the network
+respectively, and to successively remove print statements showing the same values for intermediate representations.
+
+Once you're confident that both implementations yield the same output, verify the outputs with
+`torch.allclose(original_output, output, atol=1e-3)`, and you're done with the most difficult part! Congratulations - the
+work left to be done should be a cakewalk 😊.
+
+**8. Adding all necessary model tests**
+
+At this point, you have successfully added a new model. However, it is very much possible that the model does not yet
+fully comply with the required design. To make sure the implementation is fully compatible with 🤗 Transformers, all
+common tests should pass. The Cookiecutter should have automatically added a test file for your model, probably under
+`tests/models/brand_new_bert/test_modeling_brand_new_bert.py`. Run this test file to verify that all common
+tests pass:
+
+```bash
+pytest tests/models/brand_new_bert/test_modeling_brand_new_bert.py
+```
+
+Having fixed all common tests, it is now crucial to ensure that all the nice work you have done is well tested, so that
+
+- a) The community can easily understand your work by looking at specific tests of *brand_new_bert*
+- b) Future changes to your model will not break any important feature of the model.
+
+At first, integration tests should be added. Those integration tests essentially do the same as the debugging scripts
+you used earlier to implement the model in 🤗 Transformers. A template of those model tests has already been added by the
+Cookiecutter, called `BrandNewBertModelIntegrationTests`, and only has to be filled out by you.
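+
+As a rough illustration, a filled-out integration test usually follows a pattern like the sketch below; the checkpoint
+name and the expected values are placeholders that you have to replace with the ones produced by the original
+implementation:
+
+```python
+import unittest
+
+import torch
+
+from transformers import BrandNewBertModel
+from transformers.testing_utils import require_torch, slow, torch_device
+
+
+@require_torch
+class BrandNewBertModelIntegrationTests(unittest.TestCase):
+    @slow
+    def test_inference_no_head(self):
+        model = BrandNewBertModel.from_pretrained("username/brand_new_bert").to(torch_device)
+        input_ids = torch.tensor([[0, 4, 4, 3, 2, 4, 1, 7, 19]], device=torch_device)
+        with torch.no_grad():
+            output = model(input_ids).last_hidden_state
+
+        expected_shape = torch.Size((1, 9, model.config.hidden_size))
+        self.assertEqual(output.shape, expected_shape)
+
+        # Placeholder values; fill these in with the slice produced by the
+        # original implementation of brand_new_bert.
+        expected_slice = torch.tensor(
+            [[[-0.1465, -0.6501, 0.1993], [-0.4417, -0.5920, 0.3450], [-0.5009, -0.7122, 0.4548]]],
+            device=torch_device,
+        )
+        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))
+```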
+
+To ensure that those tests are passing, run
+
+```bash
+RUN_SLOW=1 pytest -sv tests/models/brand_new_bert/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests
+```
+
+
+
+In case you are using Windows, you should replace `RUN_SLOW=1` with `SET RUN_SLOW=1`
+
+
+
+Second, all features that are special to *brand_new_bert* should be tested additionally in a separate test under
+`BrandNewBertModelTester`/`BrandNewBertModelTest`. This part is often forgotten but is extremely useful in two
+ways:
+
+- It helps to transfer the knowledge you have acquired during the model addition to the community by showing how the
+  special features of *brand_new_bert* should work.
+- Future contributors can quickly test changes to the model by running those special tests.
+
+
+**9. Implement the tokenizer**
+
+Next, we should add the tokenizer of *brand_new_bert*. Usually, the tokenizer is equivalent to or very similar to an
+already existing tokenizer of 🤗 Transformers.
+
+It is very important to find/extract the original tokenizer file and to manage to load this file into the 🤗
+Transformers implementation of the tokenizer.
+
+To ensure that the tokenizer works correctly, it is recommended to first create a script in the original repository
+that inputs a string and returns the `input_ids`. It could look similar to this (in pseudo-code):
+
+```python
+input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
+model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/")
+input_ids = model.tokenize(input_str)
+```
+
+You might have to take a deeper look again into the original repository to find the correct tokenizer function, or you
+might even have to make changes to your clone of the original repository to only output the `input_ids`. Once you have
+written a functional tokenization script that uses the original repository, you should create an analogous script for
+🤗 Transformers. It should look similar to this:
+
+```python
+from transformers import BrandNewBertTokenizer
+
+input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
+
+tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/")
+
+input_ids = tokenizer(input_str).input_ids
+```
+
+When both scripts yield the same `input_ids`, a tokenizer test file should also be added as a final step.
+
+Analogous to the modeling test files of *brand_new_bert*, the tokenization test files of *brand_new_bert* should
+contain a couple of hard-coded integration tests.
+
+**10. Run End-to-end integration tests**
+
+Having added the tokenizer, you should also add a couple of end-to-end integration tests using both the model and the
+tokenizer to `tests/models/brand_new_bert/test_modeling_brand_new_bert.py` in 🤗 Transformers.
+Such a test should show, on a meaningful
+text-to-text sample, that the 🤗 Transformers implementation works as expected. A meaningful text-to-text sample can
+include *e.g.* a source-to-target-translation pair, an article-to-summary pair, a question-to-answer pair, etc… If none
+of the ported checkpoints has been fine-tuned on a downstream task, it is enough to simply rely on the model tests. As a
+final step to ensure that the model is fully functional, it is advised that you also run all tests on GPU. It can
+happen that you forgot to add some `.to(self.device)` statements to internal tensors of the model, which would show up
+as an error in such a test. In case you have no access to a GPU, the Hugging Face team can take care of running those
+tests for you.
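+
+As a rough illustration of such an end-to-end test, and assuming *brand_new_bert* is a sequence-to-sequence model with
+a hypothetical `BrandNewBertForConditionalGeneration` head, it could look like this (the checkpoint name, input text
+and expected summary are placeholders):
+
+```python
+import unittest
+
+from transformers import BrandNewBertForConditionalGeneration, BrandNewBertTokenizer
+from transformers.testing_utils import require_torch, slow, torch_device
+
+
+@require_torch
+class BrandNewBertModelEndToEndTests(unittest.TestCase):
+    @slow
+    def test_summarization(self):
+        tokenizer = BrandNewBertTokenizer.from_pretrained("username/brand_new_bert")
+        model = BrandNewBertForConditionalGeneration.from_pretrained("username/brand_new_bert").to(torch_device)
+
+        article = "A placeholder article that you already summarized successfully with the original implementation."
+        inputs = tokenizer(article, return_tensors="pt").to(torch_device)
+        generated_ids = model.generate(**inputs, max_new_tokens=32)
+        summary = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+        # The expected string is a placeholder; use the summary produced by the
+        # original implementation for the same input.
+        self.assertEqual(summary, "A placeholder expected summary.")
+```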
+
+**11. Add Docstring**
+
+Now, all the necessary functionality for *brand_new_bert* is added - you're almost done! The only thing left to add is
+a nice docstring and a doc page. The Cookiecutter should have added a template file called
+`docs/source/model_doc/brand_new_bert.md` that you should fill out. Users of your model will usually first look at
+this page before using your model. Hence, the documentation must be understandable and concise. It is very useful for
+the community to add some *Tips* to show how the model should be used. Don't hesitate to ping the Hugging Face team
+regarding the docstrings.
+
+Next, make sure that the docstring added to `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` is
+correct and includes all necessary inputs and outputs. We have a detailed guide about writing documentation and our docstring format [here](writing-documentation). It is always good to remind oneself that documentation should
+be treated at least as carefully as the code in 🤗 Transformers since the documentation is usually the first contact
+point of the community with the model.
+
+**Code refactor**
+
+Great, now you have added all the necessary code for *brand_new_bert*. At this point, you should correct some potential
+incorrect code style by running:
+
+```bash
+make style
+```
+
+and verify that your coding style passes the quality check:
+
+```bash
+make quality
+```
+
+There are a couple of other very strict design tests in 🤗 Transformers that might still be failing, which will show up in
+the tests of your pull request. This is often because of some missing information in the docstring or some incorrect
+naming. The Hugging Face team will surely help you if you're stuck here.
+
+Lastly, it is always a good idea to refactor one's code after having ensured that the code works correctly. With all
+tests passing, now it's a good time to go over the added code again and do some refactoring.
+
+You have now finished the coding part, congratulations! 🎉 You are awesome! 😎
+
+**12. Upload the models to the model hub**
+
+In this final part, you should convert and upload all checkpoints to the model hub and add a model card for each
+uploaded model checkpoint. You can get familiar with the hub functionalities by reading our [Model sharing and uploading page](model_sharing). You should work alongside the Hugging Face team here to decide on a fitting name for each
+checkpoint and to get the required access rights to be able to upload the model under the author's organization of
+*brand_new_bert*. The `push_to_hub` method, present in all models in `transformers`, is a quick and efficient way to push your checkpoint to the hub. A little snippet is pasted below:
+
+```python
+brand_new_bert.push_to_hub("brand_new_bert")
+# Uncomment the following line to push to an organization.
+# brand_new_bert.push_to_hub("<organization>/brand_new_bert")
+```
+
+It is worth spending some time to create fitting model cards for each checkpoint. The model cards should highlight the
+specific characteristics of this particular checkpoint, *e.g.* On which dataset was the checkpoint
+pretrained/fine-tuned? On what downstream task should the model be used? Also include some code showing how to
+correctly use the model.
+
+**13. (Optional) Add notebook**
+
+It is very helpful to add a notebook that showcases in detail how *brand_new_bert* can be used for inference and/or
+fine-tuned on a downstream task.
This is not mandatory to merge your PR, but very useful for the community. + +**14. Submit your finished PR** + +You're done programming now and can move to the last step, which is getting your PR merged into main. Usually, the +Hugging Face team should have helped you already at this point, but it is worth taking some time to give your finished +PR a nice description and eventually add comments to your code, if you want to point out certain design choices to your +reviewer. + +### Share your work!! + +Now, it's time to get some credit from the community for your work! Having completed a model addition is a major +contribution to Transformers and the whole NLP community. Your code and the ported pre-trained models will certainly be +used by hundreds and possibly even thousands of developers and researchers. You should be proud of your work and share +your achievements with the community. + +**You have made another model that is super easy to access for everyone in the community! 🤯** diff --git a/docs/source/en/add_new_model.mdx b/docs/source/en/add_new_model.mdx deleted file mode 100644 index d22f2326f852..000000000000 --- a/docs/source/en/add_new_model.mdx +++ /dev/null @@ -1,841 +0,0 @@ - - -# How to add a model to 🤗 Transformers? - -The 🤗 Transformers library is often able to offer new models thanks to community contributors. But this can be a challenging project and requires an in-depth knowledge of the 🤗 Transformers library and the model to implement. At Hugging Face, we're trying to empower more of the community to actively add models and we've put together this guide to walk you through the process of adding a PyTorch model (make sure you have [PyTorch installed](https://pytorch.org/get-started/locally/)). - - - -If you're interested in implementing a TensorFlow model, take a look at the [How to convert a 🤗 Transformers model to TensorFlow](add_tensorflow_model) guide! - - - -Along the way, you'll: - -- get insights into open-source best practices -- understand the design principles behind one of the most popular deep learning libraries -- learn how to efficiently test large models -- learn how to integrate Python utilities like `black`, `isort`, and `make fix-copies` to ensure clean and readable code - -A Hugging Face team member will be available to help you along the way so you'll never be alone. 🤗 ❤️ - -To get started, open a [New model addition](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&template=new-model-addition.yml) issue for the model you want to see in 🤗 Transformers. If you're not especially picky about contributing a specific model, you can filter by the [New model label](https://github.com/huggingface/transformers/labels/New%20model) to see if there are any unclaimed model requests and work on it. - -Once you've opened a new model request, the first step is to get familiar with 🤗 Transformers if you aren't already! - -## General overview of 🤗 Transformers - -First, you should get a general overview of 🤗 Transformers. 🤗 Transformers is a very opinionated library, so there is a -chance that you don't agree with some of the library's philosophies or design choices. From our experience, however, we -found that the fundamental design choices and philosophies of the library are crucial to efficiently scale 🤗 -Transformers while keeping maintenance costs at a reasonable level. - -A good first starting point to better understand the library is to read the [documentation of our philosophy](philosophy). 
As a result of our way of working, there are some choices that we try to apply to all models: - -- Composition is generally favored over-abstraction -- Duplicating code is not always bad if it strongly improves the readability or accessibility of a model -- Model files are as self-contained as possible so that when you read the code of a specific model, you ideally only - have to look into the respective `modeling_....py` file. - -In our opinion, the library's code is not just a means to provide a product, *e.g.* the ability to use BERT for -inference, but also as the very product that we want to improve. Hence, when adding a model, the user is not only the -person that will use your model, but also everybody that will read, try to understand, and possibly tweak your code. - -With this in mind, let's go a bit deeper into the general library design. - -### Overview of models - -To successfully add a model, it is important to understand the interaction between your model and its config, -[`PreTrainedModel`], and [`PretrainedConfig`]. For exemplary purposes, we will -call the model to be added to 🤗 Transformers `BrandNewBert`. - -Let's take a look: - - - -As you can see, we do make use of inheritance in 🤗 Transformers, but we keep the level of abstraction to an absolute -minimum. There are never more than two levels of abstraction for any model in the library. `BrandNewBertModel` -inherits from `BrandNewBertPreTrainedModel` which in turn inherits from [`PreTrainedModel`] and -that's it. As a general rule, we want to make sure that a new model only depends on -[`PreTrainedModel`]. The important functionalities that are automatically provided to every new -model are [`~PreTrainedModel.from_pretrained`] and -[`~PreTrainedModel.save_pretrained`], which are used for serialization and deserialization. All of the -other important functionalities, such as `BrandNewBertModel.forward` should be completely defined in the new -`modeling_brand_new_bert.py` script. Next, we want to make sure that a model with a specific head layer, such as -`BrandNewBertForMaskedLM` does not inherit from `BrandNewBertModel`, but rather uses `BrandNewBertModel` -as a component that can be called in its forward pass to keep the level of abstraction low. Every new model requires a -configuration class, called `BrandNewBertConfig`. This configuration is always stored as an attribute in -[`PreTrainedModel`], and thus can be accessed via the `config` attribute for all classes -inheriting from `BrandNewBertPreTrainedModel`: - -```python -model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert") -model.config # model has access to its config -``` - -Similar to the model, the configuration inherits basic serialization and deserialization functionalities from -[`PretrainedConfig`]. Note that the configuration and the model are always serialized into two -different formats - the model to a *pytorch_model.bin* file and the configuration to a *config.json* file. Calling -[`~PreTrainedModel.save_pretrained`] will automatically call -[`~PretrainedConfig.save_pretrained`], so that both model and configuration are saved. - - -### Code style - -When coding your new model, keep in mind that Transformers is an opinionated library and we have a few quirks of our -own regarding how code should be written :-) - -1. The forward pass of your model should be fully written in the modeling file while being fully independent of other - models in the library. 
If you want to reuse a block from another model, copy the code and paste it with a - `# Copied from` comment on top (see [here](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160) - for a good example). -2. The code should be fully understandable, even by a non-native English speaker. This means you should pick - descriptive variable names and avoid abbreviations. As an example, `activation` is preferred to `act`. - One-letter variable names are strongly discouraged unless it's an index in a for loop. -3. More generally we prefer longer explicit code to short magical one. -4. Avoid subclassing `nn.Sequential` in PyTorch but subclass `nn.Module` and write the forward pass, so that anyone - using your code can quickly debug it by adding print statements or breaking points. -5. Your function signature should be type-annotated. For the rest, good variable names are way more readable and - understandable than type annotations. - -### Overview of tokenizers - -Not quite ready yet :-( This section will be added soon! - -## Step-by-step recipe to add a model to 🤗 Transformers - -Everyone has different preferences of how to port a model so it can be very helpful for you to take a look at summaries -of how other contributors ported models to Hugging Face. Here is a list of community blog posts on how to port a model: - -1. [Porting GPT2 Model](https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28) by [Thomas](https://huggingface.co/thomwolf) -2. [Porting WMT19 MT Model](https://huggingface.co/blog/porting-fsmt) by [Stas](https://huggingface.co/stas) - -From experience, we can tell you that the most important things to keep in mind when adding a model are: - -- Don't reinvent the wheel! Most parts of the code you will add for the new 🤗 Transformers model already exist - somewhere in 🤗 Transformers. Take some time to find similar, already existing models and tokenizers you can copy - from. [grep](https://www.gnu.org/software/grep/) and [rg](https://github.com/BurntSushi/ripgrep) are your - friends. Note that it might very well happen that your model's tokenizer is based on one model implementation, and - your model's modeling code on another one. *E.g.* FSMT's modeling code is based on BART, while FSMT's tokenizer code - is based on XLM. -- It's more of an engineering challenge than a scientific challenge. You should spend more time on creating an - efficient debugging environment than trying to understand all theoretical aspects of the model in the paper. -- Ask for help, when you're stuck! Models are the core component of 🤗 Transformers so that we at Hugging Face are more - than happy to help you at every step to add your model. Don't hesitate to ask if you notice you are not making - progress. - -In the following, we try to give you a general recipe that we found most useful when porting a model to 🤗 Transformers. - -The following list is a summary of everything that has to be done to add a model and can be used by you as a To-Do -List: - -☐ (Optional) Understood the model's theoretical aspects
-☐ Prepared 🤗 Transformers dev environment
-☐ Set up debugging environment of the original repository
-☐ Created script that successfully runs the `forward()` pass using the original repository and checkpoint
-☐ Successfully added the model skeleton to 🤗 Transformers
-☐ Successfully converted original checkpoint to 🤗 Transformers checkpoint
-☐ Successfully ran `forward()` pass in 🤗 Transformers that gives identical output to original checkpoint
-☐ Finished model tests in 🤗 Transformers
-☐ Successfully added tokenizer in 🤗 Transformers
-☐ Run end-to-end integration tests
-☐ Finished docs
-☐ Uploaded model weights to the Hub
-☐ Submitted the pull request
-☐ (Optional) Added a demo notebook - -To begin with, we usually recommend to start by getting a good theoretical understanding of `BrandNewBert`. However, -if you prefer to understand the theoretical aspects of the model *on-the-job*, then it is totally fine to directly dive -into the `BrandNewBert`'s code-base. This option might suit you better, if your engineering skills are better than -your theoretical skill, if you have trouble understanding `BrandNewBert`'s paper, or if you just enjoy programming -much more than reading scientific papers. - -### 1. (Optional) Theoretical aspects of BrandNewBert - -You should take some time to read *BrandNewBert's* paper, if such descriptive work exists. There might be large -sections of the paper that are difficult to understand. If this is the case, this is fine - don't worry! The goal is -not to get a deep theoretical understanding of the paper, but to extract the necessary information required to -effectively re-implement the model in 🤗 Transformers. That being said, you don't have to spend too much time on the -theoretical aspects, but rather focus on the practical ones, namely: - -- What type of model is *brand_new_bert*? BERT-like encoder-only model? GPT2-like decoder-only model? BART-like - encoder-decoder model? Look at the [model_summary](model_summary) if you're not familiar with the differences between those. -- What are the applications of *brand_new_bert*? Text classification? Text generation? Seq2Seq tasks, *e.g.,* - summarization? -- What is the novel feature of the model making it different from BERT/GPT-2/BART? -- Which of the already existing [🤗 Transformers models](https://huggingface.co/transformers/#contents) is most - similar to *brand_new_bert*? -- What type of tokenizer is used? A sentencepiece tokenizer? Word piece tokenizer? Is it the same tokenizer as used - for BERT or BART? - -After you feel like you have gotten a good overview of the architecture of the model, you might want to write to the -Hugging Face team with any questions you might have. This might include questions regarding the model's architecture, -its attention layer, etc. We will be more than happy to help you. - -### 2. Next prepare your environment - -1. Fork the [repository](https://github.com/huggingface/transformers) by clicking on the ‘Fork' button on the - repository's page. This creates a copy of the code under your GitHub user account. - -2. Clone your `transformers` fork to your local disk, and add the base repository as a remote: - -```bash -git clone https://github.com/[your Github handle]/transformers.git -cd transformers -git remote add upstream https://github.com/huggingface/transformers.git -``` - -3. Set up a development environment, for instance by running the following command: - -```bash -python -m venv .env -source .env/bin/activate -pip install -e ".[dev]" -``` - -and return to the parent directory - -```bash -cd .. -``` - -4. We recommend adding the PyTorch version of *brand_new_bert* to Transformers. To install PyTorch, please follow the - instructions on https://pytorch.org/get-started/locally/. - -**Note:** You don't need to have CUDA installed. Making the new model work on CPU is sufficient. - -5. To port *brand_new_bert*, you will also need access to its original repository: - -```bash -git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git -cd brand_new_bert -pip install -e . -``` - -Now you have set up a development environment to port *brand_new_bert* to 🤗 Transformers. - -### 3.-4. 
Run a pretrained checkpoint using the original repository - -At first, you will work on the original *brand_new_bert* repository. Often, the original implementation is very -“researchy”. Meaning that documentation might be lacking and the code can be difficult to understand. But this should -be exactly your motivation to reimplement *brand_new_bert*. At Hugging Face, one of our main goals is to *make people -stand on the shoulders of giants* which translates here very well into taking a working model and rewriting it to make -it as **accessible, user-friendly, and beautiful** as possible. This is the number-one motivation to re-implement -models into 🤗 Transformers - trying to make complex new NLP technology accessible to **everybody**. - -You should start thereby by diving into the original repository. - -Successfully running the official pretrained model in the original repository is often **the most difficult** step. -From our experience, it is very important to spend some time getting familiar with the original code-base. You need to -figure out the following: - -- Where to find the pretrained weights? -- How to load the pretrained weights into the corresponding model? -- How to run the tokenizer independently from the model? -- Trace one forward pass so that you know which classes and functions are required for a simple forward pass. Usually, - you only have to reimplement those functions. -- Be able to locate the important components of the model: Where is the model's class? Are there model sub-classes, - *e.g.* EncoderModel, DecoderModel? Where is the self-attention layer? Are there multiple different attention layers, - *e.g.* *self-attention*, *cross-attention*...? -- How can you debug the model in the original environment of the repo? Do you have to add *print* statements, can you - work with an interactive debugger like *ipdb*, or should you use an efficient IDE to debug the model, like PyCharm? - -It is very important that before you start the porting process, that you can **efficiently** debug code in the original -repository! Also, remember that you are working with an open-source library, so do not hesitate to open an issue, or -even a pull request in the original repository. The maintainers of this repository are most likely very happy about -someone looking into their code! - -At this point, it is really up to you which debugging environment and strategy you prefer to use to debug the original -model. We strongly advise against setting up a costly GPU environment, but simply work on a CPU both when starting to -dive into the original repository and also when starting to write the 🤗 Transformers implementation of the model. Only -at the very end, when the model has already been successfully ported to 🤗 Transformers, one should verify that the -model also works as expected on GPU. - -In general, there are two possible debugging environments for running the original model - -- [Jupyter notebooks](https://jupyter.org/) / [google colab](https://colab.research.google.com/notebooks/intro.ipynb) -- Local python scripts. - -Jupyter notebooks have the advantage that they allow for cell-by-cell execution which can be helpful to better split -logical components from one another and to have faster debugging cycles as intermediate results can be stored. Also, -notebooks are often easier to share with other contributors, which might be very helpful if you want to ask the Hugging -Face team for help. If you are familiar with Jupyter notebooks, we strongly recommend you to work with them. 
- -The obvious disadvantage of Jupyter notebooks is that if you are not used to working with them you will have to spend -some time adjusting to the new programming environment and that you might not be able to use your known debugging tools -anymore, like `ipdb`. - -For each code-base, a good first step is always to load a **small** pretrained checkpoint and to be able to reproduce a -single forward pass using a dummy integer vector of input IDs as an input. Such a script could look like this (in -pseudocode): - -```python -model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") -input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids -original_output = model.predict(input_ids) -``` - -Next, regarding the debugging strategy, there are generally a few from which to choose from: - -- Decompose the original model into many small testable components and run a forward pass on each of those for - verification -- Decompose the original model only into the original *tokenizer* and the original *model*, run a forward pass on - those, and use intermediate print statements or breakpoints for verification - -Again, it is up to you which strategy to choose. Often, one or the other is advantageous depending on the original code -base. - -If the original code-base allows you to decompose the model into smaller sub-components, *e.g.* if the original -code-base can easily be run in eager mode, it is usually worth the effort to do so. There are some important advantages -to taking the more difficult road in the beginning: - -- at a later stage when comparing the original model to the Hugging Face implementation, you can verify automatically - for each component individually that the corresponding component of the 🤗 Transformers implementation matches instead - of relying on visual comparison via print statements -- it can give you some rope to decompose the big problem of porting a model into smaller problems of just porting - individual components and thus structure your work better -- separating the model into logical meaningful components will help you to get a better overview of the model's design - and thus to better understand the model -- at a later stage those component-by-component tests help you to ensure that no regression occurs as you continue - changing your code - -[Lysandre's](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed) integration checks for ELECTRA -gives a nice example of how this can be done. - -However, if the original code-base is very complex or only allows intermediate components to be run in a compiled mode, -it might be too time-consuming or even impossible to separate the model into smaller testable sub-components. A good -example is [T5's MeshTensorFlow](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow) library which is -very complex and does not offer a simple way to decompose the model into its sub-components. For such libraries, one -often relies on verifying print statements. - -No matter which strategy you choose, the recommended procedure is often the same in that you should start to debug the -starting layers first and the ending layers last. - -It is recommended that you retrieve the output, either by print statements or sub-component functions, of the following -layers in the following order: - -1. Retrieve the input IDs passed to the model -2. Retrieve the word embeddings -3. Retrieve the input of the first Transformer layer -4. Retrieve the output of the first Transformer layer -5. 
Retrieve the output of the following n - 1 Transformer layers -6. Retrieve the output of the whole BrandNewBert Model - -Input IDs should thereby consists of an array of integers, *e.g.* `input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]` - -The outputs of the following layers often consist of multi-dimensional float arrays and can look like this: - -``` -[[ - [-0.1465, -0.6501, 0.1993, ..., 0.1451, 0.3430, 0.6024], - [-0.4417, -0.5920, 0.3450, ..., -0.3062, 0.6182, 0.7132], - [-0.5009, -0.7122, 0.4548, ..., -0.3662, 0.6091, 0.7648], - ..., - [-0.5613, -0.6332, 0.4324, ..., -0.3792, 0.7372, 0.9288], - [-0.5416, -0.6345, 0.4180, ..., -0.3564, 0.6992, 0.9191], - [-0.5334, -0.6403, 0.4271, ..., -0.3339, 0.6533, 0.8694]]], -``` - -We expect that every model added to 🤗 Transformers passes a couple of integration tests, meaning that the original -model and the reimplemented version in 🤗 Transformers have to give the exact same output up to a precision of 0.001! -Since it is normal that the exact same model written in different libraries can give a slightly different output -depending on the library framework, we accept an error tolerance of 1e-3 (0.001). It is not enough if the model gives -nearly the same output, they have to be the almost identical. Therefore, you will certainly compare the intermediate -outputs of the 🤗 Transformers version multiple times against the intermediate outputs of the original implementation of -*brand_new_bert* in which case an **efficient** debugging environment of the original repository is absolutely -important. Here is some advice is to make your debugging environment as efficient as possible. - -- Find the best way of debugging intermediate results. Is the original repository written in PyTorch? Then you should - probably take the time to write a longer script that decomposes the original model into smaller sub-components to - retrieve intermediate values. Is the original repository written in Tensorflow 1? Then you might have to rely on - TensorFlow print operations like [tf.print](https://www.tensorflow.org/api_docs/python/tf/print) to output - intermediate values. Is the original repository written in Jax? Then make sure that the model is **not jitted** when - running the forward pass, *e.g.* check-out [this link](https://github.com/google/jax/issues/196). -- Use the smallest pretrained checkpoint you can find. The smaller the checkpoint, the faster your debug cycle - becomes. It is not efficient if your pretrained model is so big that your forward pass takes more than 10 seconds. - In case only very large checkpoints are available, it might make more sense to create a dummy model in the new - environment with randomly initialized weights and save those weights for comparison with the 🤗 Transformers version - of your model -- Make sure you are using the easiest way of calling a forward pass in the original repository. Ideally, you want to - find the function in the original repository that **only** calls a single forward pass, *i.e.* that is often called - `predict`, `evaluate`, `forward` or `__call__`. You don't want to debug a function that calls `forward` - multiple times, *e.g.* to generate text, like `autoregressive_sample`, `generate`. -- Try to separate the tokenization from the model's *forward* pass. If the original repository shows examples where - you have to input a string, then try to find out where in the forward call the string input is changed to input ids - and start from this point. 
This might mean that you have to possibly write a small script yourself or change the - original code so that you can directly input the ids instead of an input string. -- Make sure that the model in your debugging setup is **not** in training mode, which often causes the model to yield - random outputs due to multiple dropout layers in the model. Make sure that the forward pass in your debugging - environment is **deterministic** so that the dropout layers are not used. Or use *transformers.utils.set_seed* - if the old and new implementations are in the same framework. - -The following section gives you more specific details/tips on how you can do this for *brand_new_bert*. - -### 5.-14. Port BrandNewBert to 🤗 Transformers - -Next, you can finally start adding new code to 🤗 Transformers. Go into the clone of your 🤗 Transformers' fork: - -```bash -cd transformers -``` - -In the special case that you are adding a model whose architecture exactly matches the model architecture of an -existing model you only have to add a conversion script as described in [this section](#write-a-conversion-script). -In this case, you can just re-use the whole model architecture of the already existing model. - -Otherwise, let's start generating a new model. You have two choices here: - -- `transformers-cli add-new-model-like` to add a new model like an existing one -- `transformers-cli add-new-model` to add a new model from our template (will look like BERT or Bart depending on the type of model you select) - -In both cases, you will be prompted with a questionnaire to fill the basic information of your model. The second command requires to install `cookiecutter`, you can find more information on it [here](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model). - -**Open a Pull Request on the main huggingface/transformers repo** - -Before starting to adapt the automatically generated code, now is the time to open a “Work in progress (WIP)” pull -request, *e.g.* “[WIP] Add *brand_new_bert*”, in 🤗 Transformers so that you and the Hugging Face team can work -side-by-side on integrating the model into 🤗 Transformers. - -You should do the following: - -1. Create a branch with a descriptive name from your main branch - -```bash -git checkout -b add_brand_new_bert -``` - -2. Commit the automatically generated code: - -```bash -git add . -git commit -``` - -3. Fetch and rebase to current main - -```bash -git fetch upstream -git rebase upstream/main -``` - -4. Push the changes to your account using: - -```bash -git push -u origin a-descriptive-name-for-my-changes -``` - -5. Once you are satisfied, go to the webpage of your fork on GitHub. Click on “Pull request”. Make sure to add the - GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for - future changes. - -6. Change the PR into a draft by clicking on “Convert to draft” on the right of the GitHub pull request web page. - -In the following, whenever you have done some progress, don't forget to commit your work and push it to your account so -that it shows in the pull request. Additionally, you should make sure to update your work with the current main from -time to time by doing: - -```bash -git fetch upstream -git merge upstream/main -``` - -In general, all questions you might have regarding the model or your implementation should be asked in your PR and -discussed/solved in the PR. 
This way, the Hugging Face team will always be notified when you are committing new code or -if you have a question. It is often very helpful to point the Hugging Face team to your added code so that the Hugging -Face team can efficiently understand your problem or question. - -To do so, you can go to the “Files changed” tab where you see all of your changes, go to a line regarding which you -want to ask a question, and click on the “+” symbol to add a comment. Whenever a question or problem has been solved, -you can click on the “Resolve” button of the created comment. - -In the same way, the Hugging Face team will open comments when reviewing your code. We recommend asking most questions -on GitHub on your PR. For some very general questions that are not very useful for the public, feel free to ping the -Hugging Face team by Slack or email. - -**5. Adapt the generated models code for brand_new_bert** - -At first, we will focus only on the model itself and not care about the tokenizer. All the relevant code should be -found in the generated files `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` and -`src/transformers/models/brand_new_bert/configuration_brand_new_bert.py`. - -Now you can finally start coding :). The generated code in -`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` will either have the same architecture as BERT if -it's an encoder-only model or BART if it's an encoder-decoder model. At this point, you should remind yourself what -you've learned in the beginning about the theoretical aspects of the model: *How is the model different from BERT or -BART?*". Implement those changes which often means to change the *self-attention* layer, the order of the normalization -layer, etc… Again, it is often useful to look at the similar architecture of already existing models in Transformers to -get a better feeling of how your model should be implemented. - -**Note** that at this point, you don't have to be very sure that your code is fully correct or clean. Rather, it is -advised to add a first *unclean*, copy-pasted version of the original code to -`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` until you feel like all the necessary code is -added. From our experience, it is much more efficient to quickly add a first version of the required code and -improve/correct the code iteratively with the conversion script as described in the next section. The only thing that -has to work at this point is that you can instantiate the 🤗 Transformers implementation of *brand_new_bert*, *i.e.* the -following command should work: - -```python -from transformers import BrandNewBertModel, BrandNewBertConfig - -model = BrandNewBertModel(BrandNewBertConfig()) -``` - -The above command will create a model according to the default parameters as defined in `BrandNewBertConfig()` with -random weights, thus making sure that the `init()` methods of all components works. - -**6. Write a conversion script** - -Next, you should write a conversion script that lets you convert the checkpoint you used to debug *brand_new_bert* in -the original repository to a checkpoint compatible with your just created 🤗 Transformers implementation of -*brand_new_bert*. It is not advised to write the conversion script from scratch, but rather to look through already -existing conversion scripts in 🤗 Transformers for one that has been used to convert a similar model that was written in -the same framework as *brand_new_bert*. 
Usually, it is enough to copy an already existing conversion script and -slightly adapt it for your use case. Don't hesitate to ask the Hugging Face team to point you to a similar already -existing conversion script for your model. - -- If you are porting a model from TensorFlow to PyTorch, a good starting point might be BERT's conversion script [here](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91) -- If you are porting a model from PyTorch to PyTorch, a good starting point might be BART's conversion script [here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py) - -In the following, we'll quickly explain how PyTorch models store layer weights and define layer names. In PyTorch, the -name of a layer is defined by the name of the class attribute you give the layer. Let's define a dummy model in -PyTorch, called `SimpleModel` as follows: - -```python -from torch import nn - - -class SimpleModel(nn.Module): - def __init__(self): - super().__init__() - self.dense = nn.Linear(10, 10) - self.intermediate = nn.Linear(10, 10) - self.layer_norm = nn.LayerNorm(10) -``` - -Now we can create an instance of this model definition which will fill all weights: `dense`, `intermediate`, -`layer_norm` with random weights. We can print the model to see its architecture - -```python -model = SimpleModel() - -print(model) -``` - -This will print out the following: - -``` -SimpleModel( - (dense): Linear(in_features=10, out_features=10, bias=True) - (intermediate): Linear(in_features=10, out_features=10, bias=True) - (layer_norm): LayerNorm((10,), eps=1e-05, elementwise_affine=True) -) -``` - -We can see that the layer names are defined by the name of the class attribute in PyTorch. You can print out the weight -values of a specific layer: - -```python -print(model.dense.weight.data) -``` - -to see that the weights were randomly initialized - -``` -tensor([[-0.0818, 0.2207, -0.0749, -0.0030, 0.0045, -0.1569, -0.1598, 0.0212, - -0.2077, 0.2157], - [ 0.1044, 0.0201, 0.0990, 0.2482, 0.3116, 0.2509, 0.2866, -0.2190, - 0.2166, -0.0212], - [-0.2000, 0.1107, -0.1999, -0.3119, 0.1559, 0.0993, 0.1776, -0.1950, - -0.1023, -0.0447], - [-0.0888, -0.1092, 0.2281, 0.0336, 0.1817, -0.0115, 0.2096, 0.1415, - -0.1876, -0.2467], - [ 0.2208, -0.2352, -0.1426, -0.2636, -0.2889, -0.2061, -0.2849, -0.0465, - 0.2577, 0.0402], - [ 0.1502, 0.2465, 0.2566, 0.0693, 0.2352, -0.0530, 0.1859, -0.0604, - 0.2132, 0.1680], - [ 0.1733, -0.2407, -0.1721, 0.1484, 0.0358, -0.0633, -0.0721, -0.0090, - 0.2707, -0.2509], - [-0.1173, 0.1561, 0.2945, 0.0595, -0.1996, 0.2988, -0.0802, 0.0407, - 0.1829, -0.1568], - [-0.1164, -0.2228, -0.0403, 0.0428, 0.1339, 0.0047, 0.1967, 0.2923, - 0.0333, -0.0536], - [-0.1492, -0.1616, 0.1057, 0.1950, -0.2807, -0.2710, -0.1586, 0.0739, - 0.2220, 0.2358]]). -``` - -In the conversion script, you should fill those randomly initialized weights with the exact weights of the -corresponding layer in the checkpoint. *E.g.* - -```python -# retrieve matching layer weights, e.g. 
by -# recursive algorithm -layer_name = "dense" -pretrained_weight = array_of_dense_layer - -model_pointer = getattr(model, "dense") - -model_pointer.weight.data = torch.from_numpy(pretrained_weight) -``` - -While doing so, you must verify that each randomly initialized weight of your PyTorch model and its corresponding -pretrained checkpoint weight exactly match in both **shape and name**. To do so, it is **necessary** to add assert -statements for the shape and print out the names of the checkpoints weights. E.g. you should add statements like: - -```python -assert ( - model_pointer.weight.shape == pretrained_weight.shape -), f"Pointer shape of random weight {model_pointer.shape} and array shape of checkpoint weight {pretrained_weight.shape} mismatched" -``` - -Besides, you should also print out the names of both weights to make sure they match, *e.g.* - -```python -logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}") -``` - -If either the shape or the name doesn't match, you probably assigned the wrong checkpoint weight to a randomly -initialized layer of the 🤗 Transformers implementation. - -An incorrect shape is most likely due to an incorrect setting of the config parameters in `BrandNewBertConfig()` that -do not exactly match those that were used for the checkpoint you want to convert. However, it could also be that -PyTorch's implementation of a layer requires the weight to be transposed beforehand. - -Finally, you should also check that **all** required weights are initialized and print out all checkpoint weights that -were not used for initialization to make sure the model is correctly converted. It is completely normal, that the -conversion trials fail with either a wrong shape statement or wrong name assignment. This is most likely because either -you used incorrect parameters in `BrandNewBertConfig()`, have a wrong architecture in the 🤗 Transformers -implementation, you have a bug in the `init()` functions of one of the components of the 🤗 Transformers -implementation or you need to transpose one of the checkpoint weights. - -This step should be iterated with the previous step until all weights of the checkpoint are correctly loaded in the -Transformers model. Having correctly loaded the checkpoint into the 🤗 Transformers implementation, you can then save -the model under a folder of your choice `/path/to/converted/checkpoint/folder` that should then contain both a -`pytorch_model.bin` file and a `config.json` file: - -```python -model.save_pretrained("/path/to/converted/checkpoint/folder") -``` - -**7. Implement the forward pass** - -Having managed to correctly load the pretrained weights into the 🤗 Transformers implementation, you should now make -sure that the forward pass is correctly implemented. In [Get familiar with the original repository](#run-a-pretrained-checkpoint-using-the-original-repository), you have already created a script that runs a forward -pass of the model using the original repository. Now you should write an analogous script using the 🤗 Transformers -implementation instead of the original one. It should look as follows: - -```python -model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder") -input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19] -output = model(input_ids).last_hidden_states -``` - -It is very likely that the 🤗 Transformers implementation and the original model implementation don't give the exact -same output the very first time or that the forward pass throws an error. 
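A minimal sketch of such a side-by-side check could look as follows. Note that `original_output.pt` is only an assumption here - it stands in for however you decide to export the output of the script you wrote for the original repository (for instance with `torch.save`):

```python
import torch

from transformers import BrandNewBertModel

model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder")
model.eval()  # make sure dropout is disabled

# use exactly the same hard-coded ids as in the script for the original repository
input_ids = torch.tensor([[0, 4, 4, 3, 2, 4, 1, 7, 19]])

with torch.no_grad():
    output = model(input_ids).last_hidden_state

# output of the original implementation, saved beforehand from the original script
original_output = torch.load("original_output.pt")

print("Shapes:", original_output.shape, output.shape)
print("Max absolute difference:", (original_output - output).abs().max().item())
print("Outputs match:", torch.allclose(original_output, output, atol=1e-3))
```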
Don't be disappointed - it's expected! First, -you should make sure that the forward pass doesn't throw any errors. It often happens that the wrong dimensions are -used leading to a *Dimensionality mismatch* error or that the wrong data type object is used, *e.g.* `torch.long` -instead of `torch.float32`. Don't hesitate to ask the Hugging Face team for help, if you don't manage to solve -certain errors. - -The final part to make sure the 🤗 Transformers implementation works correctly is to ensure that the outputs are -equivalent to a precision of `1e-3`. First, you should ensure that the output shapes are identical, *i.e.* -`outputs.shape` should yield the same value for the script of the 🤗 Transformers implementation and the original -implementation. Next, you should make sure that the output values are identical as well. This one of the most difficult -parts of adding a new model. Common mistakes why the outputs are not identical are: - -- Some layers were not added, *i.e.* an *activation* layer was not added, or the residual connection was forgotten -- The word embedding matrix was not tied -- The wrong positional embeddings are used because the original implementation uses on offset -- Dropout is applied during the forward pass. To fix this make sure *model.training is False* and that no dropout - layer is falsely activated during the forward pass, *i.e.* pass *self.training* to [PyTorch's functional dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout) - -The best way to fix the problem is usually to look at the forward pass of the original implementation and the 🤗 -Transformers implementation side-by-side and check if there are any differences. Ideally, you should debug/print out -intermediate outputs of both implementations of the forward pass to find the exact position in the network where the 🤗 -Transformers implementation shows a different output than the original implementation. First, make sure that the -hard-coded `input_ids` in both scripts are identical. Next, verify that the outputs of the first transformation of -the `input_ids` (usually the word embeddings) are identical. And then work your way up to the very last layer of the -network. At some point, you will notice a difference between the two implementations, which should point you to the bug -in the 🤗 Transformers implementation. From our experience, a simple and efficient way is to add many print statements -in both the original implementation and 🤗 Transformers implementation, at the same positions in the network -respectively, and to successively remove print statements showing the same values for intermediate presentations. - -When you're confident that both implementations yield the same output, verifying the outputs with -`torch.allclose(original_output, output, atol=1e-3)`, you're done with the most difficult part! Congratulations - the -work left to be done should be a cakewalk 😊. - -**8. Adding all necessary model tests** - -At this point, you have successfully added a new model. However, it is very much possible that the model does not yet -fully comply with the required design. To make sure, the implementation is fully compatible with 🤗 Transformers, all -common tests should pass. The Cookiecutter should have automatically added a test file for your model, probably under -the same `tests/models/brand_new_bert/test_modeling_brand_new_bert.py`. 
Run this test file to verify that all common -tests pass: - -```bash -pytest tests/models/brand_new_bert/test_modeling_brand_new_bert.py -``` - -Having fixed all common tests, it is now crucial to ensure that all the nice work you have done is well tested, so that - -- a) The community can easily understand your work by looking at specific tests of *brand_new_bert* -- b) Future changes to your model will not break any important feature of the model. - -At first, integration tests should be added. Those integration tests essentially do the same as the debugging scripts -you used earlier to implement the model to 🤗 Transformers. A template of those model tests is already added by the -Cookiecutter, called `BrandNewBertModelIntegrationTests` and only has to be filled out by you. To ensure that those -tests are passing, run - -```bash -RUN_SLOW=1 pytest -sv tests/models/brand_new_bert/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests -``` - - - -In case you are using Windows, you should replace `RUN_SLOW=1` with `SET RUN_SLOW=1` - - - -Second, all features that are special to *brand_new_bert* should be tested additionally in a separate test under -`BrandNewBertModelTester`/``BrandNewBertModelTest`. This part is often forgotten but is extremely useful in two -ways: - -- It helps to transfer the knowledge you have acquired during the model addition to the community by showing how the - special features of *brand_new_bert* should work. -- Future contributors can quickly test changes to the model by running those special tests. - - -**9. Implement the tokenizer** - -Next, we should add the tokenizer of *brand_new_bert*. Usually, the tokenizer is equivalent or very similar to an -already existing tokenizer of 🤗 Transformers. - -It is very important to find/extract the original tokenizer file and to manage to load this file into the 🤗 -Transformers' implementation of the tokenizer. - -To ensure that the tokenizer works correctly, it is recommended to first create a script in the original repository -that inputs a string and returns the `input_ids``. It could look similar to this (in pseudo-code): - -```python -input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." -model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") -input_ids = model.tokenize(input_str) -``` - -You might have to take a deeper look again into the original repository to find the correct tokenizer function or you -might even have to do changes to your clone of the original repository to only output the `input_ids`. Having written -a functional tokenization script that uses the original repository, an analogous script for 🤗 Transformers should be -created. It should look similar to this: - -```python -from transformers import BrandNewBertTokenizer - -input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." - -tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/") - -input_ids = tokenizer(input_str).input_ids -``` - -When both `input_ids` yield the same values, as a final step a tokenizer test file should also be added. - -Analogous to the modeling test files of *brand_new_bert*, the tokenization test files of *brand_new_bert* should -contain a couple of hard-coded integration tests. - -**10. 
Run End-to-end integration tests** - -Having added the tokenizer, you should also add a couple of end-to-end integration tests using both the model and the -tokenizer to `tests/models/brand_new_bert/test_modeling_brand_new_bert.py` in 🤗 Transformers. -Such a test should show on a meaningful -text-to-text sample that the 🤗 Transformers implementation works as expected. A meaningful text-to-text sample can -include *e.g.* a source-to-target-translation pair, an article-to-summary pair, a question-to-answer pair, etc… If none -of the ported checkpoints has been fine-tuned on a downstream task it is enough to simply rely on the model tests. In a -final step to ensure that the model is fully functional, it is advised that you also run all tests on GPU. It can -happen that you forgot to add some `.to(self.device)` statements to internal tensors of the model, which in such a -test would show in an error. In case you have no access to a GPU, the Hugging Face team can take care of running those -tests for you. - -**11. Add Docstring** - -Now, all the necessary functionality for *brand_new_bert* is added - you're almost done! The only thing left to add is -a nice docstring and a doc page. The Cookiecutter should have added a template file called -`docs/source/model_doc/brand_new_bert.mdx` that you should fill out. Users of your model will usually first look at -this page before using your model. Hence, the documentation must be understandable and concise. It is very useful for -the community to add some *Tips* to show how the model should be used. Don't hesitate to ping the Hugging Face team -regarding the docstrings. - -Next, make sure that the docstring added to `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` is -correct and included all necessary inputs and outputs. We have a detailed guide about writing documentation and our docstring format [here](writing-documentation). It is always to good to remind oneself that documentation should -be treated at least as carefully as the code in 🤗 Transformers since the documentation is usually the first contact -point of the community with the model. - -**Code refactor** - -Great, now you have added all the necessary code for *brand_new_bert*. At this point, you should correct some potential -incorrect code style by running: - -```bash -make style -``` - -and verify that your coding style passes the quality check: - -```bash -make quality -``` - -There are a couple of other very strict design tests in 🤗 Transformers that might still be failing, which shows up in -the tests of your pull request. This is often because of some missing information in the docstring or some incorrect -naming. The Hugging Face team will surely help you if you're stuck here. - -Lastly, it is always a good idea to refactor one's code after having ensured that the code works correctly. With all -tests passing, now it's a good time to go over the added code again and do some refactoring. - -You have now finished the coding part, congratulation! 🎉 You are Awesome! 😎 - -**12. Upload the models to the model hub** - -In this final part, you should convert and upload all checkpoints to the model hub and add a model card for each -uploaded model checkpoint. You can get familiar with the hub functionalities by reading our [Model sharing and uploading Page](model_sharing). 
You should work alongside the Hugging Face team here to decide on a fitting name for each -checkpoint and to get the required access rights to be able to upload the model under the author's organization of -*brand_new_bert*. The `push_to_hub` method, present in all models in `transformers`, is a quick and efficient way to push your checkpoint to the hub. A little snippet is pasted below: - -```python -brand_new_bert.push_to_hub("brand_new_bert") -# Uncomment the following line to push to an organization. -# brand_new_bert.push_to_hub("/brand_new_bert") -``` - -It is worth spending some time to create fitting model cards for each checkpoint. The model cards should highlight the -specific characteristics of this particular checkpoint, *e.g.* On which dataset was the checkpoint -pretrained/fine-tuned on? On what down-stream task should the model be used? And also include some code on how to -correctly use the model. - -**13. (Optional) Add notebook** - -It is very helpful to add a notebook that showcases in-detail how *brand_new_bert* can be used for inference and/or -fine-tuned on a downstream task. This is not mandatory to merge your PR, but very useful for the community. - -**14. Submit your finished PR** - -You're done programming now and can move to the last step, which is getting your PR merged into main. Usually, the -Hugging Face team should have helped you already at this point, but it is worth taking some time to give your finished -PR a nice description and eventually add comments to your code, if you want to point out certain design choices to your -reviewer. - -### Share your work!! - -Now, it's time to get some credit from the community for your work! Having completed a model addition is a major -contribution to Transformers and the whole NLP community. Your code and the ported pre-trained models will certainly be -used by hundreds and possibly even thousands of developers and researchers. You should be proud of your work and share -your achievement with the community. - -**You have made another model that is super easy to access for everyone in the community! 🤯** diff --git a/docs/source/en/add_new_pipeline.md b/docs/source/en/add_new_pipeline.md new file mode 100644 index 000000000000..70f62bf9909e --- /dev/null +++ b/docs/source/en/add_new_pipeline.md @@ -0,0 +1,258 @@ + + +# How to create a custom pipeline? + +In this guide, we will see how to create a custom pipeline and share it on the [Hub](hf.co/models) or add it to the +🤗 Transformers library. + +First and foremost, you need to decide the raw entries the pipeline will be able to take. It can be strings, raw bytes, +dictionaries or whatever seems to be the most likely desired input. Try to keep these inputs as pure Python as possible +as it makes compatibility easier (even through other languages via JSON). Those will be the `inputs` of the +pipeline (`preprocess`). + +Then define the `outputs`. Same policy as the `inputs`. The simpler, the better. Those will be the outputs of +`postprocess` method. + +Start by inheriting the base class `Pipeline` with the 4 methods needed to implement `preprocess`, +`_forward`, `postprocess`, and `_sanitize_parameters`. 
+ + +```python +from transformers import Pipeline + + +class MyPipeline(Pipeline): + def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "maybe_arg" in kwargs: + preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] + return preprocess_kwargs, {}, {} + + def preprocess(self, inputs, maybe_arg=2): + model_input = Tensor(inputs["input_ids"]) + return {"model_input": model_input} + + def _forward(self, model_inputs): + # model_inputs == {"model_input": model_input} + outputs = self.model(**model_inputs) + # Maybe {"logits": Tensor(...)} + return outputs + + def postprocess(self, model_outputs): + best_class = model_outputs["logits"].softmax(-1) + return best_class +``` + +The structure of this breakdown is to support relatively seamless support for CPU/GPU, while supporting doing +pre/postprocessing on the CPU on different threads + +`preprocess` will take the originally defined inputs, and turn them into something feedable to the model. It might +contain more information and is usually a `Dict`. + +`_forward` is the implementation detail and is not meant to be called directly. `forward` is the preferred +called method as it contains safeguards to make sure everything is working on the expected device. If anything is +linked to a real model it belongs in the `_forward` method, anything else is in the preprocess/postprocess. + +`postprocess` methods will take the output of `_forward` and turn it into the final output that was decided +earlier. + +`_sanitize_parameters` exists to allow users to pass any parameters whenever they wish, be it at initialization +time `pipeline(...., maybe_arg=4)` or at call time `pipe = pipeline(...); output = pipe(...., maybe_arg=4)`. + +The returns of `_sanitize_parameters` are the 3 dicts of kwargs that will be passed directly to `preprocess`, +`_forward`, and `postprocess`. Don't fill anything if the caller didn't call with any extra parameter. That +allows to keep the default arguments in the function definition which is always more "natural". + +A classic example would be a `top_k` argument in the post processing in classification tasks. + +```python +>>> pipe = pipeline("my-new-task") +>>> pipe("This is a test") +[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05} +{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}] + +>>> pipe("This is a test", top_k=2) +[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}] +``` + +In order to achieve that, we'll update our `postprocess` method with a default parameter to `5`. and edit +`_sanitize_parameters` to allow this new parameter. + + +```python +def postprocess(self, model_outputs, top_k=5): + best_class = model_outputs["logits"].softmax(-1) + # Add logic to handle top_k + return best_class + + +def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "maybe_arg" in kwargs: + preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] + + postprocess_kwargs = {} + if "top_k" in kwargs: + postprocess_kwargs["top_k"] = kwargs["top_k"] + return preprocess_kwargs, {}, postprocess_kwargs +``` + +Try to keep the inputs/outputs very simple and ideally JSON-serializable as it makes the pipeline usage very easy +without requiring users to understand new kinds of objects. 
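For instance, the `top_k` logic above could be completed so that `postprocess` returns plain Python types only, which keeps the output JSON-serializable. This is just one possible sketch - it assumes a single input (batch index 0) and a model config that defines `id2label`:

```python
def postprocess(self, model_outputs, top_k=5):
    probs = model_outputs["logits"][0].softmax(-1)
    scores, ids = probs.topk(top_k)
    # plain Python floats and strings only, so the output is JSON-serializable
    return [
        {"label": self.model.config.id2label[int(i)], "score": float(s)}
        for s, i in zip(scores, ids)
    ]
```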
It's also relatively common to support many different types +of arguments for ease of use (audio files, which can be filenames, URLs or pure bytes) + + + +## Adding it to the list of supported tasks + +To register your `new-task` to the list of supported tasks, you have to add it to the `PIPELINE_REGISTRY`: + +```python +from transformers.pipelines import PIPELINE_REGISTRY + +PIPELINE_REGISTRY.register_pipeline( + "new-task", + pipeline_class=MyPipeline, + pt_model=AutoModelForSequenceClassification, +) +``` + +You can specify a default model if you want, in which case it should come with a specific revision (which can be the name of a branch or a commit hash, here we took `"abcdef"`) as well as the type: + +```python +PIPELINE_REGISTRY.register_pipeline( + "new-task", + pipeline_class=MyPipeline, + pt_model=AutoModelForSequenceClassification, + default={"pt": ("user/awesome_model", "abcdef")}, + type="text", # current support type: text, audio, image, multimodal +) +``` + +## Share your pipeline on the Hub + +To share your custom pipeline on the Hub, you just have to save the custom code of your `Pipeline` subclass in a +python file. For instance, let's say we want to use a custom pipeline for sentence pair classification like this: + +```py +import numpy as np + +from transformers import Pipeline + + +def softmax(outputs): + maxes = np.max(outputs, axis=-1, keepdims=True) + shifted_exp = np.exp(outputs - maxes) + return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) + + +class PairClassificationPipeline(Pipeline): + def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "second_text" in kwargs: + preprocess_kwargs["second_text"] = kwargs["second_text"] + return preprocess_kwargs, {}, {} + + def preprocess(self, text, second_text=None): + return self.tokenizer(text, text_pair=second_text, return_tensors=self.framework) + + def _forward(self, model_inputs): + return self.model(**model_inputs) + + def postprocess(self, model_outputs): + logits = model_outputs.logits[0].numpy() + probabilities = softmax(logits) + + best_class = np.argmax(probabilities) + label = self.model.config.id2label[best_class] + score = probabilities[best_class].item() + logits = logits.tolist() + return {"label": label, "score": score, "logits": logits} +``` + +The implementation is framework agnostic, and will work for PyTorch and TensorFlow models. If we have saved this in +a file named `pair_classification.py`, we can then import it and register it like this: + +```py +from pair_classification import PairClassificationPipeline +from transformers.pipelines import PIPELINE_REGISTRY +from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification + +PIPELINE_REGISTRY.register_pipeline( + "pair-classification", + pipeline_class=PairClassificationPipeline, + pt_model=AutoModelForSequenceClassification, + tf_model=TFAutoModelForSequenceClassification, +) +``` + +Once this is done, we can use it with a pretrained model. For instance `sgugger/finetuned-bert-mrpc` has been +fine-tuned on the MRPC dataset, which classifies pairs of sentences as paraphrases or not. 
+ +```py +from transformers import pipeline + +classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc") +``` + +Then we can share it on the Hub by using the `save_pretrained` method in a `Repository`: + +```py +from huggingface_hub import Repository + +repo = Repository("test-dynamic-pipeline", clone_from="{your_username}/test-dynamic-pipeline") +classifier.save_pretrained("test-dynamic-pipeline") +repo.push_to_hub() +``` + +This will copy the file where you defined `PairClassificationPipeline` inside the folder `"test-dynamic-pipeline"`, +along with saving the model and tokenizer of the pipeline, before pushing everything into the repository +`{your_username}/test-dynamic-pipeline`. After that, anyone can use it as long as they provide the option +`trust_remote_code=True`: + +```py +from transformers import pipeline + +classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True) +``` + +## Add the pipeline to 🤗 Transformers + +If you want to contribute your pipeline to 🤗 Transformers, you will need to add a new module in the `pipelines` submodule +with the code of your pipeline, then add it to the list of tasks defined in `pipelines/__init__.py`. + +Then you will need to add tests. Create a new file `tests/test_pipelines_MY_PIPELINE.py` with examples of the other tests. + +The `run_pipeline_test` function will be very generic and run on small random models on every possible +architecture as defined by `model_mapping` and `tf_model_mapping`. + +This is very important to test future compatibility, meaning if someone adds a new model for +`XXXForQuestionAnswering` then the pipeline test will attempt to run on it. Because the models are random it's +impossible to check for actual values, that's why there is a helper `ANY` that will simply attempt to match the +output of the pipeline TYPE. + +You also *need* to implement 2 (ideally 4) tests. + +- `test_small_model_pt` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense) + and test the pipeline outputs. The results should be the same as `test_small_model_tf`. +- `test_small_model_tf` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense) + and test the pipeline outputs. The results should be the same as `test_small_model_pt`. +- `test_large_model_pt` (`optional`): Tests the pipeline on a real pipeline where the results are supposed to + make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make + sure there is no drift in future releases. +- `test_large_model_tf` (`optional`): Tests the pipeline on a real pipeline where the results are supposed to + make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make + sure there is no drift in future releases. diff --git a/docs/source/en/add_new_pipeline.mdx b/docs/source/en/add_new_pipeline.mdx deleted file mode 100644 index b0cc2cd0ff72..000000000000 --- a/docs/source/en/add_new_pipeline.mdx +++ /dev/null @@ -1,254 +0,0 @@ - - -# How to create a custom pipeline? - -In this guide, we will see how to create a custom pipeline and share it on the [Hub](hf.co/models) or add it to the -🤗 Transformers library. - -First and foremost, you need to decide the raw entries the pipeline will be able to take. It can be strings, raw bytes, -dictionaries or whatever seems to be the most likely desired input. 
Try to keep these inputs as pure Python as possible -as it makes compatibility easier (even through other languages via JSON). Those will be the `inputs` of the -pipeline (`preprocess`). - -Then define the `outputs`. Same policy as the `inputs`. The simpler, the better. Those will be the outputs of -`postprocess` method. - -Start by inheriting the base class `Pipeline` with the 4 methods needed to implement `preprocess`, -`_forward`, `postprocess`, and `_sanitize_parameters`. - - -```python -from transformers import Pipeline - - -class MyPipeline(Pipeline): - def _sanitize_parameters(self, **kwargs): - preprocess_kwargs = {} - if "maybe_arg" in kwargs: - preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] - return preprocess_kwargs, {}, {} - - def preprocess(self, inputs, maybe_arg=2): - model_input = Tensor(inputs["input_ids"]) - return {"model_input": model_input} - - def _forward(self, model_inputs): - # model_inputs == {"model_input": model_input} - outputs = self.model(**model_inputs) - # Maybe {"logits": Tensor(...)} - return outputs - - def postprocess(self, model_outputs): - best_class = model_outputs["logits"].softmax(-1) - return best_class -``` - -The structure of this breakdown is to support relatively seamless support for CPU/GPU, while supporting doing -pre/postprocessing on the CPU on different threads - -`preprocess` will take the originally defined inputs, and turn them into something feedable to the model. It might -contain more information and is usually a `Dict`. - -`_forward` is the implementation detail and is not meant to be called directly. `forward` is the preferred -called method as it contains safeguards to make sure everything is working on the expected device. If anything is -linked to a real model it belongs in the `_forward` method, anything else is in the preprocess/postprocess. - -`postprocess` methods will take the output of `_forward` and turn it into the final output that was decided -earlier. - -`_sanitize_parameters` exists to allow users to pass any parameters whenever they wish, be it at initialization -time `pipeline(...., maybe_arg=4)` or at call time `pipe = pipeline(...); output = pipe(...., maybe_arg=4)`. - -The returns of `_sanitize_parameters` are the 3 dicts of kwargs that will be passed directly to `preprocess`, -`_forward`, and `postprocess`. Don't fill anything if the caller didn't call with any extra parameter. That -allows to keep the default arguments in the function definition which is always more "natural". - -A classic example would be a `top_k` argument in the post processing in classification tasks. - -```python ->>> pipe = pipeline("my-new-task") ->>> pipe("This is a test") -[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05} -{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}] - ->>> pipe("This is a test", top_k=2) -[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}] -``` - -In order to achieve that, we'll update our `postprocess` method with a default parameter to `5`. and edit -`_sanitize_parameters` to allow this new parameter. 
- - -```python -def postprocess(self, model_outputs, top_k=5): - best_class = model_outputs["logits"].softmax(-1) - # Add logic to handle top_k - return best_class - - -def _sanitize_parameters(self, **kwargs): - preprocess_kwargs = {} - if "maybe_arg" in kwargs: - preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] - - postprocess_kwargs = {} - if "top_k" in kwargs: - postprocess_kwargs["top_k"] = kwargs["top_k"] - return preprocess_kwargs, {}, postprocess_kwargs -``` - -Try to keep the inputs/outputs very simple and ideally JSON-serializable as it makes the pipeline usage very easy -without requiring users to understand new kind of objects. It's also relatively common to support many different types -of arguments for ease of use (audio files, can be filenames, URLs or pure bytes) - - - -## Adding it to the list of supported tasks - -To register your `new-task` to the list of supported tasks, you have to add it to the `PIPELINE_REGISTRY`: - -```python -from transformers.pipelines import PIPELINE_REGISTRY - -PIPELINE_REGISTRY.register_pipeline( - "new-task", - pipeline_class=MyPipeline, - pt_model=AutoModelForSequenceClassification, -) -``` - -You can specify a default model if you want, in which case it should come with a specific revision (which can be the name of a branch or a commit hash, here we took `"abcdef"`) as well as the type: - -```python -PIPELINE_REGISTRY.register_pipeline( - "new-task", - pipeline_class=MyPipeline, - pt_model=AutoModelForSequenceClassification, - default={"pt": ("user/awesome_model", "abcdef")}, - type="text", # current support type: text, audio, image, multimodal -) -``` - -## Share your pipeline on the Hub - -To share your custom pipeline on the Hub, you just have to save the custom code of your `Pipeline` subclass in a -python file. For instance, let's say we want to use a custom pipeline for sentence pair classification like this: - -```py -import numpy as np - -from transformers import Pipeline - - -def softmax(outputs): - maxes = np.max(outputs, axis=-1, keepdims=True) - shifted_exp = np.exp(outputs - maxes) - return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) - - -class PairClassificationPipeline(Pipeline): - def _sanitize_parameters(self, **kwargs): - preprocess_kwargs = {} - if "second_text" in kwargs: - preprocess_kwargs["second_text"] = kwargs["second_text"] - return preprocess_kwargs, {}, {} - - def preprocess(self, text, second_text=None): - return self.tokenizer(text, text_pair=second_text, return_tensors=self.framework) - - def _forward(self, model_inputs): - return self.model(**model_inputs) - - def postprocess(self, model_outputs): - logits = model_outputs.logits[0].numpy() - probabilities = softmax(logits) - - best_class = np.argmax(probabilities) - label = self.model.config.id2label[best_class] - score = probabilities[best_class].item() - logits = logits.tolist() - return {"label": label, "score": score, "logits": logits} -``` - -The implementation is framework agnostic, and will work for PyTorch and TensorFlow models. 
If we have saved this in -a file named `pair_classification.py`, we can then import it and register it like this: - -```py -from pair_classification import PairClassificationPipeline -from transformers.pipelines import PIPELINE_REGISTRY -from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification - -PIPELINE_REGISTRY.register_pipeline( - "pair-classification", - pipeline_class=PairClassificationPipeline, - pt_model=AutoModelForSequenceClassification, - tf_model=TFAutoModelForSequenceClassification, -) -``` - -Once this is done, we can use it with a pretrained model. For instance `sgugger/finetuned-bert-mrpc` has been -fine-tuned on the MRPC dataset, which classifies pairs of sentences as paraphrases or not. - -```py -from transformers import pipeline - -classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc") -``` - -Then we can share it on the Hub by using the `save_pretrained` method in a `Repository`: - -```py -from huggingface_hub import Repository - -repo = Repository("test-dynamic-pipeline", clone_from="{your_username}/test-dynamic-pipeline") -classifier.save_pretrained("test-dynamic-pipeline") -repo.push_to_hub() -``` - -This will copy the file where you defined `PairClassificationPipeline` inside the folder `"test-dynamic-pipeline"`, -along with saving the model and tokenizer of the pipeline, before pushing everything in the repository -`{your_username}/test-dynamic-pipeline`. After that anyone can use it as long as they provide the option -`trust_remote_code=True`: - -```py -from transformers import pipeline - -classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True) -``` - -## Add the pipeline to 🤗 Transformers - -If you want to contribute your pipeline to 🤗 Transformers, you will need to add a new module in the `pipelines` submodule -with the code of your pipeline, then add it in the list of tasks defined in `pipelines/__init__.py`. - -Then you will need to add tests. Create a new file `tests/test_pipelines_MY_PIPELINE.py` with example with the other tests. - -The `run_pipeline_test` function will be very generic and run on small random models on every possible -architecture as defined by `model_mapping` and `tf_model_mapping`. - -This is very important to test future compatibility, meaning if someone adds a new model for -`XXXForQuestionAnswering` then the pipeline test will attempt to run on it. Because the models are random it's -impossible to check for actual values, that's why there is a helper `ANY` that will simply attempt to match the -output of the pipeline TYPE. - -You also *need* to implement 2 (ideally 4) tests. - -- `test_small_model_pt` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense) - and test the pipeline outputs. The results should be the same as `test_small_model_tf`. -- `test_small_model_tf` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense) - and test the pipeline outputs. The results should be the same as `test_small_model_pt`. -- `test_large_model_pt` (`optional`): Tests the pipeline on a real pipeline where the results are supposed to - make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make - sure there is no drift in future releases. -- `test_large_model_tf` (`optional`): Tests the pipeline on a real pipeline where the results are supposed to - make sense. These tests are slow and should be marked as such. 
Here the goal is to showcase the pipeline and to make - sure there is no drift in future releases. diff --git a/docs/source/en/add_tensorflow_model.md b/docs/source/en/add_tensorflow_model.md new file mode 100644 index 000000000000..7ea81a9fe976 --- /dev/null +++ b/docs/source/en/add_tensorflow_model.md @@ -0,0 +1,356 @@ + + +# How to convert a 🤗 Transformers model to TensorFlow? + +Having multiple frameworks available to use with 🤗 Transformers gives you flexibility to play their strengths when +designing your application, but it implies that compatibility must be added on a per-model basis. The good news is that +adding TensorFlow compatibility to an existing model is simpler than [adding a new model from scratch](add_new_model)! +Whether you wish to have a deeper understanding of large TensorFlow models, make a major open-source contribution, or +enable TensorFlow for your model of choice, this guide is for you. + +This guide empowers you, a member of our community, to contribute TensorFlow model weights and/or +architectures to be used in 🤗 Transformers, with minimal supervision from the Hugging Face team. Writing a new model +is no small feat, but hopefully this guide will make it less of a rollercoaster 🎢 and more of a walk in the park 🚶. +Harnessing our collective experiences is absolutely critical to make this process increasingly easier, and thus we +highly encourage that you suggest improvements to this guide! + +Before you dive deeper, it is recommended that you check the following resources if you're new to 🤗 Transformers: +- [General overview of 🤗 Transformers](add_new_model#general-overview-of-transformers) +- [Hugging Face's TensorFlow Philosophy](https://huggingface.co/blog/tensorflow-philosophy) + +In the remainder of this guide, you will learn what's needed to add a new TensorFlow model architecture, the +procedure to convert PyTorch into TensorFlow model weights, and how to efficiently debug mismatches across ML +frameworks. Let's get started! + + + +Are you unsure whether the model you wish to use already has a corresponding TensorFlow architecture? + +  + +Check the `model_type` field of the `config.json` of your model of choice +([example](https://huggingface.co/bert-base-uncased/blob/main/config.json#L14)). If the corresponding model folder in +🤗 Transformers has a file whose name starts with "modeling_tf", it means that it has a corresponding TensorFlow +architecture ([example](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert)). + + + + +## Step-by-step guide to add TensorFlow model architecture code + +There are many ways to design a large model architecture, and multiple ways of implementing said design. However, +you might recall from our [general overview of 🤗 Transformers](add_new_model#general-overview-of-transformers) +that we are an opinionated bunch - the ease of use of 🤗 Transformers relies on consistent design choices. From +experience, we can tell you a few important things about adding TensorFlow models: + +- Don't reinvent the wheel! More often than not, there are at least two reference implementations you should check: the +PyTorch equivalent of the model you are implementing and other TensorFlow models for the same class of problems. +- Great model implementations survive the test of time. This doesn't happen because the code is pretty, but rather +because the code is clear, easy to debug and build upon. 
If you make the life of the maintainers easy with your +TensorFlow implementation, by replicating the same patterns as in other TensorFlow models and minimizing the mismatch +to the PyTorch implementation, you ensure your contribution will be long lived. +- Ask for help when you're stuck! The 🤗 Transformers team is here to help, and we've probably found solutions to the same +problems you're facing. + +Here's an overview of the steps needed to add a TensorFlow model architecture: +1. Select the model you wish to convert +2. Prepare transformers dev environment +3. (Optional) Understand theoretical aspects and the existing implementation +4. Implement the model architecture +5. Implement model tests +6. Submit the pull request +7. (Optional) Build demos and share with the world + +### 1.-3. Prepare your model contribution + +**1. Select the model you wish to convert** + +Let's start off with the basics: the first thing you need to know is the architecture you want to convert. If you +don't have your eyes set on a specific architecture, asking the 🤗 Transformers team for suggestions is a great way to +maximize your impact - we will guide you towards the most prominent architectures that are missing on the TensorFlow +side. If the specific model you want to use with TensorFlow already has a TensorFlow architecture implementation in +🤗 Transformers but is lacking weights, feel free to jump straight into the +[weight conversion section](#adding-tensorflow-weights-to-hub) +of this page. + +For simplicity, the remainder of this guide assumes you've decided to contribute with the TensorFlow version of +*BrandNewBert* (the same example as in the [guide](add_new_model) to add a new model from scratch). + + + +Before starting the work on a TensorFlow model architecture, double-check that there is no ongoing effort to do so. +You can search for `BrandNewBert` on the +[pull request GitHub page](https://github.com/huggingface/transformers/pulls?q=is%3Apr) to confirm that there is no +TensorFlow-related pull request. + + + + +**2. Prepare transformers dev environment** + +Having selected the model architecture, open a draft PR to signal your intention to work on it. Follow the +instructions below to set up your environment and open a draft PR. + +1. Fork the [repository](https://github.com/huggingface/transformers) by clicking on the 'Fork' button on the + repository's page. This creates a copy of the code under your GitHub user account. + +2. Clone your `transformers` fork to your local disk, and add the base repository as a remote: + +```bash +git clone https://github.com/[your Github handle]/transformers.git +cd transformers +git remote add upstream https://github.com/huggingface/transformers.git +``` + +3. Set up a development environment, for instance by running the following command: + +```bash +python -m venv .env +source .env/bin/activate +pip install -e ".[dev]" +``` + +Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a +failure with this command. If that's the case make sure to install TensorFlow then do: + +```bash +pip install -e ".[quality]" +``` + +**Note:** You don't need to have CUDA installed. Making the new model work on CPU is sufficient. + +4. Create a branch with a descriptive name from your main branch + +```bash +git checkout -b add_tf_brand_new_bert +``` + +5. Fetch and rebase to current main + +```bash +git fetch upstream +git rebase upstream/main +``` + +6. 
Add an empty `.py` file in `transformers/src/models/brandnewbert/` named `modeling_tf_brandnewbert.py`. This will +be your TensorFlow model file. + +7. Push the changes to your account using: + +```bash +git add . +git commit -m "initial commit" +git push -u origin add_tf_brand_new_bert +``` + +8. Once you are satisfied, go to the webpage of your fork on GitHub. Click on “Pull request”. Make sure to add the + GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for + future changes. + +9. Change the PR into a draft by clicking on “Convert to draft” on the right of the GitHub pull request web page. + + +Now you have set up a development environment to port *BrandNewBert* to TensorFlow in 🤗 Transformers. + + +**3. (Optional) Understand theoretical aspects and the existing implementation** + +You should take some time to read *BrandNewBert's* paper, if such descriptive work exists. There might be large +sections of the paper that are difficult to understand. If this is the case, this is fine - don't worry! The goal is +not to get a deep theoretical understanding of the paper, but to extract the necessary information required to +effectively re-implement the model in 🤗 Transformers using TensorFlow. That being said, you don't have to spend too +much time on the theoretical aspects, but rather focus on the practical ones, namely the existing model documentation +page (e.g. [model docs for BERT](model_doc/bert)). + +After you've grasped the basics of the models you are about to implement, it's important to understand the existing +implementation. This is a great chance to confirm that a working implementation matches your expectations for the +model, as well as to foresee technical challenges on the TensorFlow side. + +It's perfectly natural that you feel overwhelmed with the amount of information that you've just absorbed. It is +definitely not a requirement that you understand all facets of the model at this stage. Nevertheless, we highly +encourage you to clear any pressing questions in our [forum](https://discuss.huggingface.co/). + + +### 4. Model implementation + +Now it's time to finally start coding. Our suggested starting point is the PyTorch file itself: copy the contents of +`modeling_brand_new_bert.py` inside `src/transformers/models/brand_new_bert/` into +`modeling_tf_brand_new_bert.py`. The goal of this section is to modify the file and update the import structure of +🤗 Transformers such that you can import `TFBrandNewBert` and +`TFBrandNewBert.from_pretrained(model_repo, from_pt=True)` successfully loads a working TensorFlow *BrandNewBert* model. + +Sadly, there is no prescription to convert a PyTorch model into TensorFlow. You can, however, follow our selection of +tips to make the process as smooth as possible: +- Prepend `TF` to the name of all classes (e.g. `BrandNewBert` becomes `TFBrandNewBert`). +- Most PyTorch operations have a direct TensorFlow replacement. For example, `torch.nn.Linear` corresponds to + `tf.keras.layers.Dense`, `torch.nn.Dropout` corresponds to `tf.keras.layers.Dropout`, etc. If you're not sure + about a specific operation, you can use the [TensorFlow documentation](https://www.tensorflow.org/api_docs/python/tf) + or the [PyTorch documentation](https://pytorch.org/docs/stable/). +- Look for patterns in the 🤗 Transformers codebase. If you come across a certain operation that doesn't have a direct + replacement, the odds are that someone else already had the same problem. 
+- By default, keep the same variable names and structure as in PyTorch. This will make it easier to debug, track + issues, and add fixes down the line. +- Some layers have different default values in each framework. A notable example is the batch normalization layer's + epsilon (`1e-5` in [PyTorch](https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm2d.html#torch.nn.BatchNorm2d) + and `1e-3` in [TensorFlow](https://www.tensorflow.org/api_docs/python/tf/keras/layers/BatchNormalization)). + Double-check the documentation! +- PyTorch's `nn.Parameter` variables typically need to be initialized within TF Layer's `build()`. See the following + example: [PyTorch](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_vit_mae.py#L212) / + [TensorFlow](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_tf_vit_mae.py#L220) +- If the PyTorch model has a `#copied from ...` on top of a function, the odds are that your TensorFlow model can also + borrow that function from the architecture it was copied from, assuming it has a TensorFlow architecture. +- Assigning the `name` attribute correctly in TensorFlow functions is critical to do the `from_pt=True` weight + cross-loading. `name` is almost always the name of the corresponding variable in the PyTorch code. If `name` is not + properly set, you will see it in the error message when loading the model weights. +- The logic of the base model class, `BrandNewBertModel`, will actually reside in `TFBrandNewBertMainLayer`, a Keras + layer subclass ([example](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L719)). + `TFBrandNewBertModel` will simply be a wrapper around this layer. +- Keras models need to be built in order to load pretrained weights. For that reason, `TFBrandNewBertPreTrainedModel` + will need to hold an example of inputs to the model, the `dummy_inputs` + ([example](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L916)). +- If you get stuck, ask for help - we're here to help you! 🤗 + +In addition to the model file itself, you will also need to add the pointers to the model classes and related +documentation pages. You can complete this part entirely following the patterns in other PRs +([example](https://github.com/huggingface/transformers/pull/18020/files)). Here's a list of the needed manual +changes: +- Include all public classes of *BrandNewBert* in `src/transformers/__init__.py` +- Add *BrandNewBert* classes to the corresponding Auto classes in `src/transformers/models/auto/modeling_tf_auto.py` +- Add the lazy loading classes related to *BrandNewBert* in `src/transformers/utils/dummy_tf_objects.py` +- Update the import structures for the public classes in `src/transformers/models/brand_new_bert/__init__.py` +- Add the documentation pointers to the public methods of *BrandNewBert* in `docs/source/en/model_doc/brand_new_bert.md` +- Add yourself to the list of contributors to *BrandNewBert* in `docs/source/en/model_doc/brand_new_bert.md` +- Finally, add a green tick ✅ to the TensorFlow column of *BrandNewBert* in `docs/source/en/index.md` + +When you're happy with your implementation, run the following checklist to confirm that your model architecture is +ready: +1. 
All layers that behave differently at train time (e.g. Dropout) are called with a `training` argument, which is +propagated all the way from the top-level classes +2. You have used `#copied from ...` whenever possible +3. `TFBrandNewBertMainLayer` and all classes that use it have their `call` function decorated with `@unpack_inputs` +4. `TFBrandNewBertMainLayer` is decorated with `@keras_serializable` +5. A TensorFlow model can be loaded from PyTorch weights using `TFBrandNewBert.from_pretrained(model_repo, from_pt=True)` +6. You can call the TensorFlow model using the expected input format + + +### 5. Add model tests + +Hurray, you've implemented a TensorFlow model! Now it's time to add tests to make sure that your model behaves as +expected. As in the previous section, we suggest you start by copying the `test_modeling_brand_new_bert.py` file in +`tests/models/brand_new_bert/` into `test_modeling_tf_brand_new_bert.py`, and continue by making the necessary +TensorFlow replacements. For now, in all `.from_pretrained()` calls, you should use the `from_pt=True` flag to load +the existing PyTorch weights. + +After you're done, it's time for the moment of truth: run the tests! 😬 + +```bash +NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \ +py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py +``` + +The most likely outcome is that you'll see a bunch of errors. Don't worry, this is expected! Debugging ML models is +notoriously hard, and the key ingredient to success is patience (and `breakpoint()`). In our experience, the hardest +problems arise from subtle mismatches between ML frameworks, for which we have a few pointers at the end of this guide. +In other cases, a general test might not be directly applicable to your model, in which case we suggest an override +at the model test class level. Regardless of the issue, don't hesitate to ask for help in your draft pull request if +you're stuck. + +When all tests pass, congratulations, your model is nearly ready to be added to the 🤗 Transformers library! 🎉 + +### 6.-7. Ensure everyone can use your model + +**6. Submit the pull request** + +Once you're done with the implementation and the tests, it's time to submit a pull request. Before pushing your code, +run our code formatting utility, `make fixup` 🪄. This will automatically fix any formatting issues, which would cause +our automatic checks to fail. + +It's now time to convert your draft pull request into a real pull request. To do so, click on the "Ready for +review" button and add Joao (`@gante`) and Matt (`@Rocketknight1`) as reviewers. A model pull request will need +at least 3 reviewers, but they will take care of finding appropriate additional reviewers for your model. + +After all reviewers are happy with the state of your PR, the final action point is to remove the `from_pt=True` flag in +`.from_pretrained()` calls. Since there are no TensorFlow weights, you will have to add them! Check the section +below for instructions on how to do it. + +Finally, when the TensorFlow weights get merged, you have at least 3 reviewer approvals, and all CI checks are +green, double-check the tests locally one last time + +```bash +NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \ +py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py +``` + +and we will merge your PR! Congratulations on the milestone 🎉 + +**7. (Optional) Build demos and share with the world** + +One of the hardest parts about open-source is discovery. 
How can the other users learn about the existence of your +fabulous TensorFlow contribution? With proper communication, of course! 📣 + +There are two main ways to share your model with the community: +- Build demos. These include Gradio demos, notebooks, and other fun ways to show off your model. We highly + encourage you to add a notebook to our [community-driven demos](https://huggingface.co/docs/transformers/community). +- Share stories on social media like Twitter and LinkedIn. You should be proud of your work and share + your achievement with the community - your model can now be used by thousands of engineers and researchers around + the world 🌍! We will be happy to retweet your posts and help you share your work with the community. + + +## Adding TensorFlow weights to 🤗 Hub + +Assuming that the TensorFlow model architecture is available in 🤗 Transformers, converting PyTorch weights into +TensorFlow weights is a breeze! + +Here's how to do it: +1. Make sure you are logged into your Hugging Face account in your terminal. You can log in using the command + `huggingface-cli login` (you can find your access tokens [here](https://huggingface.co/settings/tokens)) +2. Run `transformers-cli pt-to-tf --model-name foo/bar`, where `foo/bar` is the name of the model repository + containing the PyTorch weights you want to convert +3. Tag `@joaogante` and `@Rocketknight1` in the 🤗 Hub PR the command above has just created + +That's it! 🎉 + + +## Debugging mismatches across ML frameworks 🐛 + +At some point, when adding a new architecture or when creating TensorFlow weights for an existing architecture, you +might come across errors complaining about mismatches between PyTorch and TensorFlow. You might even decide to open the +model architecture code for the two frameworks, and find that they look identical. What's going on? 🤔 + +First of all, let's talk about why understanding these mismatches matters. Many community members will use 🤗 +Transformers models out of the box, and trust that our models behave as expected. When there is a large mismatch +between the two frameworks, it implies that the model is not following the reference implementation for at least one +of the frameworks. This might lead to silent failures, in which the model runs but has poor performance. This is +arguably worse than a model that fails to run at all! To that end, we aim at having a framework mismatch smaller than +`1e-5` at all stages of the model. + +As in other numerical problems, the devil is in the details. And as in any detail-oriented craft, the secret +ingredient here is patience. Here is our suggested workflow for when you come across this type of issues: +1. Locate the source of mismatches. The model you're converting probably has near identical inner variables up to a + certain point. Place `breakpoint()` statements in the two frameworks' architectures, and compare the values of the + numerical variables in a top-down fashion until you find the source of the problems. +2. Now that you've pinpointed the source of the issue, get in touch with the 🤗 Transformers team. It is possible + that we've seen a similar problem before and can promptly provide a solution. As a fallback, scan popular pages + like StackOverflow and GitHub issues. +3. If there is no solution in sight, it means you'll have to go deeper. The good news is that you've located the + issue, so you can focus on the problematic instruction, abstracting away the rest of the model! 
The bad news is + that you'll have to venture into the source implementation of said instruction. In some cases, you might find an + issue with a reference implementation - don't abstain from opening an issue in the upstream repository. + +In some cases, in discussion with the 🤗 Transformers team, we might find that fixing the mismatch is infeasible. +When the mismatch is very small in the output layers of the model (but potentially large in the hidden states), we +might decide to ignore it in favor of distributing the model. The `pt-to-tf` CLI mentioned above has a `--max-error` +flag to override the error message at weight conversion time. diff --git a/docs/source/en/add_tensorflow_model.mdx b/docs/source/en/add_tensorflow_model.mdx deleted file mode 100644 index e145a7d00184..000000000000 --- a/docs/source/en/add_tensorflow_model.mdx +++ /dev/null @@ -1,346 +0,0 @@ - - -# How to convert a 🤗 Transformers model to TensorFlow? - -Having multiple frameworks available to use with 🤗 Transformers gives you flexibility to play their strengths when -designing your application, but it implies that compatibility must be added on a per-model basis. The good news is that -adding TensorFlow compatibility to an existing model is simpler than [adding a new model from scratch](add_new_model)! -Whether you wish to have a deeper understanding of large TensorFlow models, make a major open-source contribution, or -enable TensorFlow for your model of choice, this guide is for you. - -This guide empowers you, a member of our community, to contribute TensorFlow model weights and/or -architectures to be used in 🤗 Transformers, with minimal supervision from the Hugging Face team. Writing a new model -is no small feat, but hopefully this guide will make it less of a rollercoaster 🎢 and more of a walk in the park 🚶. -Harnessing our collective experiences is absolutely critical to make this process increasingly easier, and thus we -highly encourage that you suggest improvements to this guide! - -Before you dive deeper, it is recommended that you check the following resources if you're new to 🤗 Transformers: -- [General overview of 🤗 Transformers](add_new_model#general-overview-of-transformers) -- [Hugging Face's TensorFlow Philosophy](https://huggingface.co/blog/tensorflow-philosophy) - -In the remainder of this guide, you will learn what's needed to add a new TensorFlow model architecture, the -procedure to convert PyTorch into TensorFlow model weights, and how to efficiently debug mismatches across ML -frameworks. Let's get started! - - - -Are you unsure whether the model you wish to use already has a corresponding TensorFlow architecture? - -  - -Check the `model_type` field of the `config.json` of your model of choice -([example](https://huggingface.co/bert-base-uncased/blob/main/config.json#L14)). If the corresponding model folder in -🤗 Transformers has a file whose name starts with "modeling_tf", it means that it has a corresponding TensorFlow -architecture ([example](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert)). - - - - -## Step-by-step guide to add TensorFlow model architecture code - -There are many ways to design a large model architecture, and multiple ways of implementing said design. However, -you might recall from our [general overview of 🤗 Transformers](add_new_model#general-overview-of-transformers) -that we are an opinionated bunch - the ease of use of 🤗 Transformers relies on consistent design choices. 
From -experience, we can tell you a few important things about adding TensorFlow models: - -- Don't reinvent the wheel! More often that not, there are at least two reference implementations you should check: the -PyTorch equivalent of the model you are implementing and other TensorFlow models for the same class of problems. -- Great model implementations survive the test of time. This doesn't happen because the code is pretty, but rather -because the code is clear, easy to debug and build upon. If you make the life of the maintainers easy with your -TensorFlow implementation, by replicating the same patterns as in other TensorFlow models and minimizing the mismatch -to the PyTorch implementation, you ensure your contribution will be long lived. -- Ask for help when you're stuck! The 🤗 Transformers team is here to help, and we've probably found solutions to the same -problems you're facing. - -Here's an overview of the steps needed to add a TensorFlow model architecture: -1. Select the model you wish to convert -2. Prepare transformers dev environment -3. (Optional) Understand theoretical aspects and the existing implementation -4. Implement the model architecture -5. Implement model tests -6. Submit the pull request -7. (Optional) Build demos and share with the world - -### 1.-3. Prepare your model contribution - -**1. Select the model you wish to convert** - -Let's start off with the basics: the first thing you need to know is the architecture you want to convert. If you -don't have your eyes set on a specific architecture, asking the 🤗 Transformers team for suggestions is a great way to -maximize your impact - we will guide you towards the most prominent architectures that are missing on the TensorFlow -side. If the specific model you want to use with TensorFlow already has a TensorFlow architecture implementation in -🤗 Transformers but is lacking weights, feel free to jump straight into the -[weight conversion section](#adding-tensorflow-weights-to-hub) -of this page. - -For simplicity, the remainder of this guide assumes you've decided to contribute with the TensorFlow version of -*BrandNewBert* (the same example as in the [guide](add_new_model) to add a new model from scratch). - - - -Before starting the work on a TensorFlow model architecture, double-check that there is no ongoing effort to do so. -You can search for `BrandNewBert` on the -[pull request GitHub page](https://github.com/huggingface/transformers/pulls?q=is%3Apr) to confirm that there is no -TensorFlow-related pull request. - - - - -**2. Prepare transformers dev environment** - -Having selected the model architecture, open an draft PR to signal your intention to work on it. Follow the -instructions below to set up your environment and open a draft PR. - -1. Fork the [repository](https://github.com/huggingface/transformers) by clicking on the 'Fork' button on the - repository's page. This creates a copy of the code under your GitHub user account. - -2. Clone your `transformers` fork to your local disk, and add the base repository as a remote: - -```bash -git clone https://github.com/[your Github handle]/transformers.git -cd transformers -git remote add upstream https://github.com/huggingface/transformers.git -``` - -3. Set up a development environment, for instance by running the following command: - -```bash -python -m venv .env -source .env/bin/activate -pip install -e ".[dev]" -``` - -**Note:** You don't need to have CUDA installed. Making the new model work on CPU is sufficient. - -4. 
Create a branch with a descriptive name from your main branch - -```bash -git checkout -b add_tf_brand_new_bert -``` - -5. Fetch and rebase to current main - -```bash -git fetch upstream -git rebase upstream/main -``` - -6. Add an empty `.py` file in `transformers/src/models/brandnewbert/` named `modeling_tf_brandnewbert.py`. This will -be your TensorFlow model file. - -7. Push the changes to your account using: - -```bash -git add . -git commit -m "initial commit" -git push -u origin add_tf_brand_new_bert -``` - -8. Once you are satisfied, go to the webpage of your fork on GitHub. Click on “Pull request”. Make sure to add the - GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for - future changes. - -9. Change the PR into a draft by clicking on “Convert to draft” on the right of the GitHub pull request web page. - - -Now you have set up a development environment to port *BrandNewBert* to TensorFlow in 🤗 Transformers. - - -**3. (Optional) Understand theoretical aspects and the existing implementation** - -You should take some time to read *BrandNewBert's* paper, if such descriptive work exists. There might be large -sections of the paper that are difficult to understand. If this is the case, this is fine - don't worry! The goal is -not to get a deep theoretical understanding of the paper, but to extract the necessary information required to -effectively re-implement the model in 🤗 Transformers using TensorFlow. That being said, you don't have to spend too -much time on the theoretical aspects, but rather focus on the practical ones, namely the existing model documentation -page (e.g. [model docs for BERT](model_doc/bert)). - -After you've grasped the basics of the models you are about to implement, it's important to understand the existing -implementation. This is a great chance to confirm that a working implementation matches your expectations for the -model, as well as to foresee technical challenges on the TensorFlow side. - -It's perfectly natural that you feel overwhelmed with the amount of information that you've just absorbed. It is -definitely not a requirement that you understand all facets of the model at this stage. Nevertheless, we highly -encourage you to clear any pressing questions in our [forum](https://discuss.huggingface.co/). - - -### 4. Model implementation - -Now it's time to finally start coding. Our suggested starting point is the PyTorch file itself: copy the contents of -`modeling_brand_new_bert.py` inside `src/transformers/models/brand_new_bert/` into -`modeling_tf_brand_new_bert.py`. The goal of this section is to modify the file and update the import structure of -🤗 Transformers such that you can import `TFBrandNewBert` and -`TFBrandNewBert.from_pretrained(model_repo, from_pt=True)` successfully loads a working TensorFlow *BrandNewBert* model. - -Sadly, there is no prescription to convert a PyTorch model into TensorFlow. You can, however, follow our selection of -tips to make the process as smooth as possible: -- Prepend `TF` to the name of all classes (e.g. `BrandNewBert` becomes `TFBrandNewBert`). -- Most PyTorch operations have a direct TensorFlow replacement. For example, `torch.nn.Linear` corresponds to - `tf.keras.layers.Dense`, `torch.nn.Dropout` corresponds to `tf.keras.layers.Dropout`, etc. 
If you're not sure - about a specific operation, you can use the [TensorFlow documentation](https://www.tensorflow.org/api_docs/python/tf) - or the [PyTorch documentation](https://pytorch.org/docs/stable/). -- Look for patterns in the 🤗 Transformers codebase. If you come across a certain operation that doesn't have a direct - replacement, the odds are that someone else already had the same problem. -- By default, keep the same variable names and structure as in PyTorch. This will make it easier to debug, track - issues, and add fixes down the line. -- Some layers have different default values in each framework. A notable example is the batch normalization layer's - epsilon (`1e-5` in [PyTorch](https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm2d.html#torch.nn.BatchNorm2d) - and `1e-3` in [TensorFlow](https://www.tensorflow.org/api_docs/python/tf/keras/layers/BatchNormalization)). - Double-check the documentation! -- PyTorch's `nn.Parameter` variables typically need to be initialized within TF Layer's `build()`. See the following - example: [PyTorch](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_vit_mae.py#L212) / - [TensorFlow](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_tf_vit_mae.py#L220) -- If the PyTorch model has a `#copied from ...` on top of a function, the odds are that your TensorFlow model can also - borrow that function from the architecture it was copied from, assuming it has a TensorFlow architecture. -- Assigning the `name` attribute correctly in TensorFlow functions is critical to do the `from_pt=True` weight - cross-loading. `name` is almost always the name of the corresponding variable in the PyTorch code. If `name` is not - properly set, you will see it in the error message when loading the model weights. -- The logic of the base model class, `BrandNewBertModel`, will actually reside in `TFBrandNewBertMainLayer`, a Keras - layer subclass ([example](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L719)). - `TFBrandNewBertModel` will simply be a wrapper around this layer. -- Keras models need to be built in order to load pretrained weights. For that reason, `TFBrandNewBertPreTrainedModel` - will need to hold an example of inputs to the model, the `dummy_inputs` - ([example](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L916)). -- If you get stuck, ask for help - we're here to help you! 🤗 - -In addition to the model file itself, you will also need to add the pointers to the model classes and related -documentation pages. You can complete this part entirely following the patterns in other PRs -([example](https://github.com/huggingface/transformers/pull/18020/files)). 
Here's a list of the needed manual -changes: -- Include all public classes of *BrandNewBert* in `src/transformers/__init__.py` -- Add *BrandNewBert* classes to the corresponding Auto classes in `src/transformers/models/auto/modeling_tf_auto.py` -- Include the modeling file in the documentation test file list in `utils/documentation_tests.txt` -- Add the lazy loading classes related to *BrandNewBert* in `src/transformers/utils/dummy_tf_objects.py` -- Update the import structures for the public classes in `src/transformers/models/brand_new_bert/__init__.py` -- Add the documentation pointers to the public methods of *BrandNewBert* in `docs/source/en/model_doc/brand_new_bert.mdx` -- Add yourself to the list of contributors to *BrandNewBert* in `docs/source/en/model_doc/brand_new_bert.mdx` -- Finally, add a green tick ✅ to the TensorFlow column of *BrandNewBert* in `docs/source/en/index.mdx` - -When you're happy with your implementation, run the following checklist to confirm that your model architecture is -ready: -1. All layers that behave differently at train time (e.g. Dropout) are called with a `training` argument, which is -propagated all the way from the top-level classes -2. You have used `#copied from ...` whenever possible -3. `TFBrandNewBertMainLayer` and all classes that use it have their `call` function decorated with `@unpack_inputs` -4. `TFBrandNewBertMainLayer` is decorated with `@keras_serializable` -5. A TensorFlow model can be loaded from PyTorch weights using `TFBrandNewBert.from_pretrained(model_repo, from_pt=True)` -6. You can call the TensorFlow model using the expected input format - - -### 5. Add model tests - -Hurray, you've implemented a TensorFlow model! Now it's time to add tests to make sure that your model behaves as -expected. As in the previous section, we suggest you start by copying the `test_modeling_brand_new_bert.py` file in -`tests/models/brand_new_bert/` into `test_modeling_tf_brand_new_bert.py`, and continue by making the necessary -TensorFlow replacements. For now, in all `.from_pretrained()` calls, you should use the `from_pt=True` flag to load -the existing PyTorch weights. - -After you're done, it's time for the moment of truth: run the tests! 😬 - -```bash -NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \ -py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py -``` - -The most likely outcome is that you'll see a bunch of errors. Don't worry, this is expected! Debugging ML models is -notoriously hard, and the key ingredient to success is patience (and `breakpoint()`). In our experience, the hardest -problems arise from subtle mismatches between ML frameworks, for which we have a few pointers at the end of this guide. -In other cases, a general test might not be directly applicable to your model, in which case we suggest an override -at the model test class level. Regardless of the issue, don't hesitate to ask for help in your draft pull request if -you're stuck. - -When all tests pass, congratulations, your model is nearly ready to be added to the 🤗 Transformers library! 🎉 - -### 6.-7. Ensure everyone can use your model - -**6. Submit the pull request** - -Once you're done with the implementation and the tests, it's time to submit a pull request. Before pushing your code, -run our code formatting utility, `make fixup` 🪄. This will automatically fix any formatting issues, which would cause -our automatic checks to fail. - -It's now time to convert your draft pull request into a real pull request. 
To do so, click on the "Ready for -review" button and add Joao (`@gante`) and Matt (`@Rocketknight1`) as reviewers. A model pull request will need -at least 3 reviewers, but they will take care of finding appropriate additional reviewers for your model. - -After all reviewers are happy with the state of your PR, the final action point is to remove the `from_pt=True` flag in -`.from_pretrained()` calls. Since there are no TensorFlow weights, you will have to add them! Check the section -below for instructions on how to do it. - -Finally, when the TensorFlow weights get merged, you have at least 3 reviewer approvals, and all CI checks are -green, double-check the tests locally one last time - -```bash -NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \ -py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py -``` - -and we will merge your PR! Congratulations on the milestone 🎉 - -**7. (Optional) Build demos and share with the world** - -One of the hardest parts about open-source is discovery. How can the other users learn about the existence of your -fabulous TensorFlow contribution? With proper communication, of course! 📣 - -There are two main ways to share your model with the community: -- Build demos. These include Gradio demos, notebooks, and other fun ways to show off your model. We highly - encourage you to add a notebook to our [community-driven demos](https://huggingface.co/docs/transformers/community). -- Share stories on social media like Twitter and LinkedIn. You should be proud of your work and share - your achievement with the community - your model can now be used by thousands of engineers and researchers around - the world 🌍! We will be happy to retweet your posts and help you share your work with the community. - - -## Adding TensorFlow weights to 🤗 Hub - -Assuming that the TensorFlow model architecture is available in 🤗 Transformers, converting PyTorch weights into -TensorFlow weights is a breeze! - -Here's how to do it: -1. Make sure you are logged into your Hugging Face account in your terminal. You can log in using the command - `huggingface-cli login` (you can find your access tokens [here](https://huggingface.co/settings/tokens)) -2. Run `transformers-cli pt-to-tf --model-name foo/bar`, where `foo/bar` is the name of the model repository - containing the PyTorch weights you want to convert -3. Tag `@joaogante` and `@Rocketknight1` in the 🤗 Hub PR the command above has just created - -That's it! 🎉 - - -## Debugging mismatches across ML frameworks 🐛 - -At some point, when adding a new architecture or when creating TensorFlow weights for an existing architecture, you -might come across errors compaining about mismatches between PyTorch and TensorFlow. You might even decide to open the -model architecture code for the two frameworks, and find that they look identical. What's going on? 🤔 - -First of all, let's talk about why understanding these mismatches matters. Many community members will use 🤗 -Transformers models out of the box, and trust that our models behave as expected. When there is a large mismatch -between the two frameworks, it implies that the model is not following the reference implementation for at least one -of the frameworks. This might lead to silent failures, in which the model runs but has poor performance. This is -arguably worse than a model that fails to run at all! To that end, we aim at having a framework mismatch smaller than -`1e-5` at all stages of the model. 
- -As in other numerical problems, the devil is in the details. And as in any detail-oriented craft, the secret -ingredient here is patience. Here is our suggested workflow for when you come across this type of issues: -1. Locate the source of mismatches. The model you're converting probably has near identical inner variables up to a - certain point. Place `breakpoint()` statements in the two frameworks' architectures, and compare the values of the - numerical variables in a top-down fashion until you find the source of the problems. -2. Now that you've pinpointed the source of the issue, get in touch with the 🤗 Transformers team. It is possible - that we've seen a similar problem before and can promptly provide a solution. As a fallback, scan popular pages - like StackOverflow and GitHub issues. -3. If there is no solution in sight, it means you'll have to go deeper. The good news is that you've located the - issue, so you can focus on the problematic instruction, abstracting away the rest of the model! The bad news is - that you'll have to venture into the source implementation of said instruction. In some cases, you might find an - issue with a reference implementation - don't abstain from opening an issue in the upstream repository. - -In some cases, in dicussion with the 🤗 Transformers team, we might find that the fixing the mismatch is infeasible. -When the mismatch is very small in the output layers of the model (but potentially large in the hidden states), we -might decide to ignore it in favor of distributing the model. The `pt-to-tf` CLI mentioned above has a `--max-error` -flag to override the error message at weight conversion time. diff --git a/docs/source/en/attention.md b/docs/source/en/attention.md new file mode 100644 index 000000000000..3a4f93b33ff2 --- /dev/null +++ b/docs/source/en/attention.md @@ -0,0 +1,61 @@ + + +# Attention mechanisms + +Most transformer models use full attention in the sense that the attention matrix is square. It can be a big +computational bottleneck when you have long texts. Longformer and reformer are models that try to be more efficient and +use a sparse version of the attention matrix to speed up training. + +## LSH attention + +[Reformer](#reformer) uses LSH attention. In the softmax(QK^t), only the biggest elements (in the softmax +dimension) of the matrix QK^t are going to give useful contributions. So for each query q in Q, we can consider only +the keys k in K that are close to q. A hash function is used to determine if q and k are close. The attention mask is +modified to mask the current token (except at the first position), because it will give a query and a key equal (so +very similar to each other). Since the hash can be a bit random, several hash functions are used in practice +(determined by a n_rounds parameter) and then are averaged together. + +## Local attention + +[Longformer](#longformer) uses local attention: often, the local context (e.g., what are the two tokens to the +left and right?) is enough to take action for a given token. Also, by stacking attention layers that have a small +window, the last layer will have a receptive field of more than just the tokens in the window, allowing them to build a +representation of the whole sentence. + +Some preselected input tokens are also given global attention: for those few tokens, the attention matrix can access +all tokens and this process is symmetric: all other tokens have access to those specific tokens (on top of the ones in +their local window). 
This is shown in Figure 2d of the paper, see below for a sample attention mask:
+
+<!-- figure: sample local attention mask -->
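+
+For intuition, here is a toy sketch (not the actual Longformer implementation; the helper name, window size and
+choice of global token are purely illustrative) of how such a combined local + global attention mask can be built:
+
+```py
+import numpy as np
+
+def local_global_attention_mask(seq_len, window, global_idx=()):
+    # True where query token i is allowed to attend to key token j
+    i = np.arange(seq_len)[:, None]
+    j = np.arange(seq_len)[None, :]
+    mask = np.abs(i - j) <= window  # local sliding window around each token
+    for g in global_idx:
+        mask[g, :] = True  # a global token attends to every position...
+        mask[:, g] = True  # ...and every position attends to it (symmetric)
+    return mask
+
+print(local_global_attention_mask(seq_len=8, window=1, global_idx=(0,)).astype(int))
+```
+
+With `window=1` and token 0 marked as global, each token only sees itself and its immediate neighbours, while row
+and column 0 are fully attended.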
+ +Using those attention matrices with less parameters then allows the model to have inputs having a bigger sequence +length. + +## Other tricks + +### Axial positional encodings + +[Reformer](#reformer) uses axial positional encodings: in traditional transformer models, the positional encoding +E is a matrix of size \\(l\\) by \\(d\\), \\(l\\) being the sequence length and \\(d\\) the dimension of the +hidden state. If you have very long texts, this matrix can be huge and take way too much space on the GPU. To alleviate +that, axial positional encodings consist of factorizing that big matrix E in two smaller matrices E1 and E2, with +dimensions \\(l_{1} \times d_{1}\\) and \\(l_{2} \times d_{2}\\), such that \\(l_{1} \times l_{2} = l\\) and +\\(d_{1} + d_{2} = d\\) (with the product for the lengths, this ends up being way smaller). The embedding for time +step \\(j\\) in E is obtained by concatenating the embeddings for timestep \\(j \% l1\\) in E1 and \\(j // l1\\) +in E2. diff --git a/docs/source/en/autoclass_tutorial.md b/docs/source/en/autoclass_tutorial.md new file mode 100644 index 000000000000..833882aa713e --- /dev/null +++ b/docs/source/en/autoclass_tutorial.md @@ -0,0 +1,143 @@ + + +# Load pretrained instances with an AutoClass + +With so many different Transformer architectures, it can be challenging to create one for your checkpoint. As a part of 🤗 Transformers core philosophy to make the library easy, simple and flexible to use, an `AutoClass` automatically infers and loads the correct architecture from a given checkpoint. The `from_pretrained()` method lets you quickly load a pretrained model for any architecture so you don't have to devote time and resources to train a model from scratch. Producing this type of checkpoint-agnostic code means if your code works for one checkpoint, it will work with another checkpoint - as long as it was trained for a similar task - even if the architecture is different. + + + +Remember, architecture refers to the skeleton of the model and checkpoints are the weights for a given architecture. For example, [BERT](https://huggingface.co/bert-base-uncased) is an architecture, while `bert-base-uncased` is a checkpoint. Model is a general term that can mean either architecture or checkpoint. + + + +In this tutorial, learn to: + +* Load a pretrained tokenizer. +* Load a pretrained image processor +* Load a pretrained feature extractor. +* Load a pretrained processor. +* Load a pretrained model. + +## AutoTokenizer + +Nearly every NLP task begins with a tokenizer. A tokenizer converts your input into a format that can be processed by the model. + +Load a tokenizer with [`AutoTokenizer.from_pretrained`]: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") +``` + +Then tokenize your input as shown below: + +```py +>>> sequence = "In a hole in the ground there lived a hobbit." +>>> print(tokenizer(sequence)) +{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +## AutoImageProcessor + +For vision tasks, an image processor processes the image into the correct input format. 
+
+```py
+>>> from transformers import AutoImageProcessor
+
+>>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
+```
+
+
+## AutoFeatureExtractor
+
+For audio tasks, a feature extractor processes the audio signal into the correct input format.
+
+Load a feature extractor with [`AutoFeatureExtractor.from_pretrained`]:
+
+```py
+>>> from transformers import AutoFeatureExtractor
+
+>>> feature_extractor = AutoFeatureExtractor.from_pretrained(
+...     "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
+... )
+```
+
+## AutoProcessor
+
+Multimodal tasks require a processor that combines two types of preprocessing tools. For example, the [LayoutLMV2](model_doc/layoutlmv2) model requires an image processor to handle images and a tokenizer to handle text; a processor combines both of them.
+
+Load a processor with [`AutoProcessor.from_pretrained`]:
+
+```py
+>>> from transformers import AutoProcessor
+
+>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased")
+```
+
+## AutoModel
+
+
+Finally, the `AutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`AutoModelForSequenceClassification.from_pretrained`]:
+
+```py
+>>> from transformers import AutoModelForSequenceClassification
+
+>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+```
+
+Easily reuse the same checkpoint to load an architecture for a different task:
+
+```py
+>>> from transformers import AutoModelForTokenClassification
+
+>>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")
+```
+
+
+For PyTorch models, the `from_pretrained()` method uses `torch.load()` which internally uses `pickle` and is known to be insecure. In general, never load a model that could have come from an untrusted source, or that could have been tampered with. This security risk is partially mitigated for public models hosted on the Hugging Face Hub, which are [scanned for malware](https://huggingface.co/docs/hub/security-malware) at each commit. See the [Hub documentation](https://huggingface.co/docs/hub/security) for best practices like [signed commit verification](https://huggingface.co/docs/hub/security-gpg#signing-commits-with-gpg) with GPG.
+
+TensorFlow and Flax checkpoints are not affected, and can be loaded within PyTorch architectures using the `from_tf` and `from_flax` kwargs for the `from_pretrained` method to circumvent this issue.
+
+
+Generally, we recommend using the `AutoTokenizer` class and the `AutoModelFor` class to load pretrained instances of models. This will ensure you load the correct architecture every time. In the next [tutorial](preprocessing), learn how to use your newly loaded tokenizer, image processor, feature extractor and processor to preprocess a dataset for fine-tuning.
+
+
+Finally, the `TFAutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks).
For example, load a model for sequence classification with [`TFAutoModelForSequenceClassification.from_pretrained`]: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Easily reuse the same checkpoint to load an architecture for a different task: + +```py +>>> from transformers import TFAutoModelForTokenClassification + +>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") +``` + +Generally, we recommend using the `AutoTokenizer` class and the `TFAutoModelFor` class to load pretrained instances of models. This will ensure you load the correct architecture every time. In the next [tutorial](preprocessing), learn how to use your newly loaded tokenizer, image processor, feature extractor and processor to preprocess a dataset for fine-tuning. + + diff --git a/docs/source/en/autoclass_tutorial.mdx b/docs/source/en/autoclass_tutorial.mdx deleted file mode 100644 index 6b44e41a856c..000000000000 --- a/docs/source/en/autoclass_tutorial.mdx +++ /dev/null @@ -1,139 +0,0 @@ - - -# Load pretrained instances with an AutoClass - -With so many different Transformer architectures, it can be challenging to create one for your checkpoint. As a part of 🤗 Transformers core philosophy to make the library easy, simple and flexible to use, an `AutoClass` automatically infer and load the correct architecture from a given checkpoint. The `from_pretrained()` method lets you quickly load a pretrained model for any architecture so you don't have to devote time and resources to train a model from scratch. Producing this type of checkpoint-agnostic code means if your code works for one checkpoint, it will work with another checkpoint - as long as it was trained for a similar task - even if the architecture is different. - - - -Remember, architecture refers to the skeleton of the model and checkpoints are the weights for a given architecture. For example, [BERT](https://huggingface.co/bert-base-uncased) is an architecture, while `bert-base-uncased` is a checkpoint. Model is a general term that can mean either architecture or checkpoint. - - - -In this tutorial, learn to: - -* Load a pretrained tokenizer. -* Load a pretrained image processor -* Load a pretrained feature extractor. -* Load a pretrained processor. -* Load a pretrained model. - -## AutoTokenizer - -Nearly every NLP task begins with a tokenizer. A tokenizer converts your input into a format that can be processed by the model. - -Load a tokenizer with [`AutoTokenizer.from_pretrained`]: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") -``` - -Then tokenize your input as shown below: - -```py ->>> sequence = "In a hole in the ground there lived a hobbit." ->>> print(tokenizer(sequence)) -{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -## AutoImageProcessor - -For vision tasks, an image processor processes the image into the correct input format. - -```py ->>> from transformers import AutoImageProcessor - ->>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") -``` - - -## AutoFeatureExtractor - -For audio tasks, a feature extractor processes the audio signal the correct input format. 
- -Load a feature extractor with [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained( -... "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" -... ) -``` - -## AutoProcessor - -Multimodal tasks require a processor that combines two types of preprocessing tools. For example, the [LayoutLMV2](model_doc/layoutlmv2) model requires an image processor to handle images and a tokenizer to handle text; a processor combines both of them. - -Load a processor with [`AutoProcessor.from_pretrained`]: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased") -``` - -## AutoModel - - - -Finally, the `AutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`AutoModelForSequenceClassification.from_pretrained`]: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Easily reuse the same checkpoint to load an architecture for a different task: - -```py ->>> from transformers import AutoModelForTokenClassification - ->>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") -``` - - - -For PyTorch models, the `from_pretrained()` method uses `torch.load()` which internally uses `pickle` and is known to be insecure. In general, never load a model that could have come from an untrusted source, or that could have been tampered with. This security risk is partially mitigated for public models hosted on the Hugging Face Hub, which are [scanned for malware](https://huggingface.co/docs/hub/security-malware) at each commit. See the [Hub documentation](https://huggingface.co/docs/hub/security) for best practices like [signed commit verification](https://huggingface.co/docs/hub/security-gpg#signing-commits-with-gpg) with GPG. - -TensorFlow and Flax checkpoints are not affected, and can be loaded within PyTorch architectures using the `from_tf` and `from_flax` kwargs for the `from_pretrained` method to circumvent this issue. - - - -Generally, we recommend using the `AutoTokenizer` class and the `AutoModelFor` class to load pretrained instances of models. This will ensure you load the correct architecture every time. In the next [tutorial](preprocessing), learn how to use your newly loaded tokenizer, image processor, feature extractor and processor to preprocess a dataset for fine-tuning. - - -Finally, the `TFAutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`TFAutoModelForSequenceClassification.from_pretrained`]: - -```py ->>> from transformers import TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Easily reuse the same checkpoint to load an architecture for a different task: - -```py ->>> from transformers import TFAutoModelForTokenClassification - ->>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") -``` - -Generally, we recommend using the `AutoTokenizer` class and the `TFAutoModelFor` class to load pretrained instances of models. 
This will ensure you load the correct architecture every time. In the next [tutorial](preprocessing), learn how to use your newly loaded tokenizer, image processor, feature extractor and processor to preprocess a dataset for fine-tuning. - - diff --git a/docs/source/en/benchmarks.md b/docs/source/en/benchmarks.md new file mode 100644 index 000000000000..5023d2486979 --- /dev/null +++ b/docs/source/en/benchmarks.md @@ -0,0 +1,387 @@ + + +# Benchmarks + + + +Hugging Face's Benchmarking tools are deprecated and it is advised to use external Benchmarking libraries to measure the speed +and memory complexity of Transformer models. + + + +[[open-in-colab]] + +Let's take a look at how 🤗 Transformers models can be benchmarked, best practices, and already available benchmarks. + +A notebook explaining in more detail how to benchmark 🤗 Transformers models can be found [here](https://github.com/huggingface/notebooks/tree/main/examples/benchmark.ipynb). + +## How to benchmark 🤗 Transformers models + +The classes [`PyTorchBenchmark`] and [`TensorFlowBenchmark`] allow to flexibly benchmark 🤗 Transformers models. The benchmark classes allow us to measure the _peak memory usage_ and _required time_ for both _inference_ and _training_. + + + +Hereby, _inference_ is defined by a single forward pass, and _training_ is defined by a single forward pass and +backward pass. + + + +The benchmark classes [`PyTorchBenchmark`] and [`TensorFlowBenchmark`] expect an object of type [`PyTorchBenchmarkArguments`] and +[`TensorFlowBenchmarkArguments`], respectively, for instantiation. [`PyTorchBenchmarkArguments`] and [`TensorFlowBenchmarkArguments`] are data classes and contain all relevant configurations for their corresponding benchmark class. In the following example, it is shown how a BERT model of type _bert-base-cased_ can be benchmarked. + + + +```py +>>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments + +>>> args = PyTorchBenchmarkArguments(models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]) +>>> benchmark = PyTorchBenchmark(args) +``` + + +```py +>>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments + +>>> args = TensorFlowBenchmarkArguments( +... models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] +... ) +>>> benchmark = TensorFlowBenchmark(args) +``` + + + +Here, three arguments are given to the benchmark argument data classes, namely `models`, `batch_sizes`, and +`sequence_lengths`. The argument `models` is required and expects a `list` of model identifiers from the +[model hub](https://huggingface.co/models) The `list` arguments `batch_sizes` and `sequence_lengths` define +the size of the `input_ids` on which the model is benchmarked. There are many more parameters that can be configured +via the benchmark argument data classes. For more detail on these one can either directly consult the files +`src/transformers/benchmark/benchmark_args_utils.py`, `src/transformers/benchmark/benchmark_args.py` (for PyTorch) +and `src/transformers/benchmark/benchmark_args_tf.py` (for Tensorflow). Alternatively, running the following shell +commands from root will print out a descriptive list of all configurable parameters for PyTorch and Tensorflow +respectively. + + + +```bash +python examples/pytorch/benchmarking/run_benchmark.py --help +``` + +An instantiated benchmark object can then simply be run by calling `benchmark.run()`. 
+ +```py +>>> results = benchmark.run() +>>> print(results) +==================== INFERENCE - SPEED - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Time in s +-------------------------------------------------------------------------------- +bert-base-uncased 8 8 0.006 +bert-base-uncased 8 32 0.006 +bert-base-uncased 8 128 0.018 +bert-base-uncased 8 512 0.088 +-------------------------------------------------------------------------------- + +==================== INFERENCE - MEMORY - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Memory in MB +-------------------------------------------------------------------------------- +bert-base-uncased 8 8 1227 +bert-base-uncased 8 32 1281 +bert-base-uncased 8 128 1307 +bert-base-uncased 8 512 1539 +-------------------------------------------------------------------------------- + +==================== ENVIRONMENT INFORMATION ==================== + +- transformers_version: 2.11.0 +- framework: PyTorch +- use_torchscript: False +- framework_version: 1.4.0 +- python_version: 3.6.10 +- system: Linux +- cpu: x86_64 +- architecture: 64bit +- date: 2020-06-29 +- time: 08:58:43.371351 +- fp16: False +- use_multiprocessing: True +- only_pretrain_model: False +- cpu_ram_mb: 32088 +- use_gpu: True +- num_gpus: 1 +- gpu: TITAN RTX +- gpu_ram_mb: 24217 +- gpu_power_watts: 280.0 +- gpu_performance_state: 2 +- use_tpu: False +``` + + +```bash +python examples/tensorflow/benchmarking/run_benchmark_tf.py --help +``` + +An instantiated benchmark object can then simply be run by calling `benchmark.run()`. + +```py +>>> results = benchmark.run() +>>> print(results) +>>> results = benchmark.run() +>>> print(results) +==================== INFERENCE - SPEED - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Time in s +-------------------------------------------------------------------------------- +bert-base-uncased 8 8 0.005 +bert-base-uncased 8 32 0.008 +bert-base-uncased 8 128 0.022 +bert-base-uncased 8 512 0.105 +-------------------------------------------------------------------------------- + +==================== INFERENCE - MEMORY - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Memory in MB +-------------------------------------------------------------------------------- +bert-base-uncased 8 8 1330 +bert-base-uncased 8 32 1330 +bert-base-uncased 8 128 1330 +bert-base-uncased 8 512 1770 +-------------------------------------------------------------------------------- + +==================== ENVIRONMENT INFORMATION ==================== + +- transformers_version: 2.11.0 +- framework: Tensorflow +- use_xla: False +- framework_version: 2.2.0 +- python_version: 3.6.10 +- system: Linux +- cpu: x86_64 +- architecture: 64bit +- date: 2020-06-29 +- time: 09:26:35.617317 +- fp16: False +- use_multiprocessing: True +- only_pretrain_model: False +- cpu_ram_mb: 32088 +- use_gpu: True +- num_gpus: 1 +- gpu: TITAN RTX +- gpu_ram_mb: 24217 +- gpu_power_watts: 280.0 +- gpu_performance_state: 2 +- use_tpu: False +``` + + + +By default, the _time_ and the _required memory_ for _inference_ are benchmarked. 
In the example output above the first +two sections show the result corresponding to _inference time_ and _inference memory_. In addition, all relevant +information about the computing environment, _e.g._ the GPU type, the system, the library versions, etc... are printed +out in the third section under _ENVIRONMENT INFORMATION_. This information can optionally be saved in a _.csv_ file +when adding the argument `save_to_csv=True` to [`PyTorchBenchmarkArguments`] and +[`TensorFlowBenchmarkArguments`] respectively. In this case, every section is saved in a separate +_.csv_ file. The path to each _.csv_ file can optionally be defined via the argument data classes. + +Instead of benchmarking pre-trained models via their model identifier, _e.g._ `bert-base-uncased`, the user can +alternatively benchmark an arbitrary configuration of any available model class. In this case, a `list` of +configurations must be inserted with the benchmark args as follows. + + + +```py +>>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments, BertConfig + +>>> args = PyTorchBenchmarkArguments( +... models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] +... ) +>>> config_base = BertConfig() +>>> config_384_hid = BertConfig(hidden_size=384) +>>> config_6_lay = BertConfig(num_hidden_layers=6) + +>>> benchmark = PyTorchBenchmark(args, configs=[config_base, config_384_hid, config_6_lay]) +>>> benchmark.run() +==================== INFERENCE - SPEED - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Time in s +-------------------------------------------------------------------------------- +bert-base 8 128 0.006 +bert-base 8 512 0.006 +bert-base 8 128 0.018 +bert-base 8 512 0.088 +bert-384-hid 8 8 0.006 +bert-384-hid 8 32 0.006 +bert-384-hid 8 128 0.011 +bert-384-hid 8 512 0.054 +bert-6-lay 8 8 0.003 +bert-6-lay 8 32 0.004 +bert-6-lay 8 128 0.009 +bert-6-lay 8 512 0.044 +-------------------------------------------------------------------------------- + +==================== INFERENCE - MEMORY - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Memory in MB +-------------------------------------------------------------------------------- +bert-base 8 8 1277 +bert-base 8 32 1281 +bert-base 8 128 1307 +bert-base 8 512 1539 +bert-384-hid 8 8 1005 +bert-384-hid 8 32 1027 +bert-384-hid 8 128 1035 +bert-384-hid 8 512 1255 +bert-6-lay 8 8 1097 +bert-6-lay 8 32 1101 +bert-6-lay 8 128 1127 +bert-6-lay 8 512 1359 +-------------------------------------------------------------------------------- + +==================== ENVIRONMENT INFORMATION ==================== + +- transformers_version: 2.11.0 +- framework: PyTorch +- use_torchscript: False +- framework_version: 1.4.0 +- python_version: 3.6.10 +- system: Linux +- cpu: x86_64 +- architecture: 64bit +- date: 2020-06-29 +- time: 09:35:25.143267 +- fp16: False +- use_multiprocessing: True +- only_pretrain_model: False +- cpu_ram_mb: 32088 +- use_gpu: True +- num_gpus: 1 +- gpu: TITAN RTX +- gpu_ram_mb: 24217 +- gpu_power_watts: 280.0 +- gpu_performance_state: 2 +- use_tpu: False +``` + + +```py +>>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments, BertConfig + +>>> args = TensorFlowBenchmarkArguments( +... 
models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] +... ) +>>> config_base = BertConfig() +>>> config_384_hid = BertConfig(hidden_size=384) +>>> config_6_lay = BertConfig(num_hidden_layers=6) + +>>> benchmark = TensorFlowBenchmark(args, configs=[config_base, config_384_hid, config_6_lay]) +>>> benchmark.run() +==================== INFERENCE - SPEED - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Time in s +-------------------------------------------------------------------------------- +bert-base 8 8 0.005 +bert-base 8 32 0.008 +bert-base 8 128 0.022 +bert-base 8 512 0.106 +bert-384-hid 8 8 0.005 +bert-384-hid 8 32 0.007 +bert-384-hid 8 128 0.018 +bert-384-hid 8 512 0.064 +bert-6-lay 8 8 0.002 +bert-6-lay 8 32 0.003 +bert-6-lay 8 128 0.0011 +bert-6-lay 8 512 0.074 +-------------------------------------------------------------------------------- + +==================== INFERENCE - MEMORY - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Memory in MB +-------------------------------------------------------------------------------- +bert-base 8 8 1330 +bert-base 8 32 1330 +bert-base 8 128 1330 +bert-base 8 512 1770 +bert-384-hid 8 8 1330 +bert-384-hid 8 32 1330 +bert-384-hid 8 128 1330 +bert-384-hid 8 512 1540 +bert-6-lay 8 8 1330 +bert-6-lay 8 32 1330 +bert-6-lay 8 128 1330 +bert-6-lay 8 512 1540 +-------------------------------------------------------------------------------- + +==================== ENVIRONMENT INFORMATION ==================== + +- transformers_version: 2.11.0 +- framework: Tensorflow +- use_xla: False +- framework_version: 2.2.0 +- python_version: 3.6.10 +- system: Linux +- cpu: x86_64 +- architecture: 64bit +- date: 2020-06-29 +- time: 09:38:15.487125 +- fp16: False +- use_multiprocessing: True +- only_pretrain_model: False +- cpu_ram_mb: 32088 +- use_gpu: True +- num_gpus: 1 +- gpu: TITAN RTX +- gpu_ram_mb: 24217 +- gpu_power_watts: 280.0 +- gpu_performance_state: 2 +- use_tpu: False +``` + + + +Again, _inference time_ and _required memory_ for _inference_ are measured, but this time for customized configurations +of the `BertModel` class. This feature can especially be helpful when deciding for which configuration the model +should be trained. + + +## Benchmark best practices + +This section lists a couple of best practices one should be aware of when benchmarking a model. + +- Currently, only single device benchmarking is supported. When benchmarking on GPU, it is recommended that the user + specifies on which device the code should be run by setting the `CUDA_VISIBLE_DEVICES` environment variable in the + shell, _e.g._ `export CUDA_VISIBLE_DEVICES=0` before running the code. +- The option `no_multi_processing` should only be set to `True` for testing and debugging. To ensure accurate + memory measurement it is recommended to run each memory benchmark in a separate process by making sure + `no_multi_processing` is set to `True`. +- One should always state the environment information when sharing the results of a model benchmark. Results can vary + heavily between different GPU devices, library versions, etc., so that benchmark results on their own are not very + useful for the community. 
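+
+As a minimal sketch tying the recommendations above together (assuming a single-GPU machine on which
+`export CUDA_VISIBLE_DEVICES=0` has already been run; the exact `.csv` output paths can be configured via the
+argument data classes):
+
+```py
+>>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments
+
+>>> args = PyTorchBenchmarkArguments(
+...     models=["bert-base-uncased"],
+...     batch_sizes=[8],
+...     sequence_lengths=[8, 32, 128, 512],
+...     save_to_csv=True,  # write each result section to a separate .csv file
+... )
+>>> benchmark = PyTorchBenchmark(args)
+>>> results = benchmark.run()
+```
+
+Remember to share the printed _ENVIRONMENT INFORMATION_ section along with the numbers whenever you publish
+benchmark results.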
+ + +## Sharing your benchmark + +Previously all available core models (10 at the time) have been benchmarked for _inference time_, across many different +settings: using PyTorch, with and without TorchScript, using TensorFlow, with and without XLA. All of those tests were +done across CPUs (except for TensorFlow XLA) and GPUs. + +The approach is detailed in the [following blogpost](https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2) and the results are +available [here](https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing). + +With the new _benchmark_ tools, it is easier than ever to share your benchmark results with the community + +- [PyTorch Benchmarking Results](https://github.com/huggingface/transformers/tree/main/examples/pytorch/benchmarking/README.md). +- [TensorFlow Benchmarking Results](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/benchmarking/README.md). diff --git a/docs/source/en/benchmarks.mdx b/docs/source/en/benchmarks.mdx deleted file mode 100644 index 244112001f5c..000000000000 --- a/docs/source/en/benchmarks.mdx +++ /dev/null @@ -1,383 +0,0 @@ - - -# Benchmarks - - - -Hugging Face's Benchmarking tools are deprecated and it is advised to use external Benchmarking libraries to measure the speed -and memory complexity of Transformer models. - - - -[[open-in-colab]] - -Let's take a look at how 🤗 Transformers models can be benchmarked, best practices, and already available benchmarks. - -A notebook explaining in more detail how to benchmark 🤗 Transformers models can be found [here](https://github.com/huggingface/notebooks/tree/main/examples/benchmark.ipynb). - -## How to benchmark 🤗 Transformers models - -The classes [`PyTorchBenchmark`] and [`TensorFlowBenchmark`] allow to flexibly benchmark 🤗 Transformers models. The benchmark classes allow us to measure the _peak memory usage_ and _required time_ for both _inference_ and _training_. - - - -Hereby, _inference_ is defined by a single forward pass, and _training_ is defined by a single forward pass and -backward pass. - - - -The benchmark classes [`PyTorchBenchmark`] and [`TensorFlowBenchmark`] expect an object of type [`PyTorchBenchmarkArguments`] and -[`TensorFlowBenchmarkArguments`], respectively, for instantiation. [`PyTorchBenchmarkArguments`] and [`TensorFlowBenchmarkArguments`] are data classes and contain all relevant configurations for their corresponding benchmark class. In the following example, it is shown how a BERT model of type _bert-base-cased_ can be benchmarked. - - - -```py ->>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments - ->>> args = PyTorchBenchmarkArguments(models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]) ->>> benchmark = PyTorchBenchmark(args) -``` - - -```py ->>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments - ->>> args = TensorFlowBenchmarkArguments( -... models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] -... ) ->>> benchmark = TensorFlowBenchmark(args) -``` - - - -Here, three arguments are given to the benchmark argument data classes, namely `models`, `batch_sizes`, and -`sequence_lengths`. The argument `models` is required and expects a `list` of model identifiers from the -[model hub](https://huggingface.co/models) The `list` arguments `batch_sizes` and `sequence_lengths` define -the size of the `input_ids` on which the model is benchmarked. 
There are many more parameters that can be configured -via the benchmark argument data classes. For more detail on these one can either directly consult the files -`src/transformers/benchmark/benchmark_args_utils.py`, `src/transformers/benchmark/benchmark_args.py` (for PyTorch) -and `src/transformers/benchmark/benchmark_args_tf.py` (for Tensorflow). Alternatively, running the following shell -commands from root will print out a descriptive list of all configurable parameters for PyTorch and Tensorflow -respectively. - - - -```bash -python examples/pytorch/benchmarking/run_benchmark.py --help -``` - -An instantiated benchmark object can then simply be run by calling `benchmark.run()`. - -```py ->>> results = benchmark.run() ->>> print(results) -==================== INFERENCE - SPEED - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Time in s --------------------------------------------------------------------------------- -bert-base-uncased 8 8 0.006 -bert-base-uncased 8 32 0.006 -bert-base-uncased 8 128 0.018 -bert-base-uncased 8 512 0.088 --------------------------------------------------------------------------------- - -==================== INFERENCE - MEMORY - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Memory in MB --------------------------------------------------------------------------------- -bert-base-uncased 8 8 1227 -bert-base-uncased 8 32 1281 -bert-base-uncased 8 128 1307 -bert-base-uncased 8 512 1539 --------------------------------------------------------------------------------- - -==================== ENVIRONMENT INFORMATION ==================== - -- transformers_version: 2.11.0 -- framework: PyTorch -- use_torchscript: False -- framework_version: 1.4.0 -- python_version: 3.6.10 -- system: Linux -- cpu: x86_64 -- architecture: 64bit -- date: 2020-06-29 -- time: 08:58:43.371351 -- fp16: False -- use_multiprocessing: True -- only_pretrain_model: False -- cpu_ram_mb: 32088 -- use_gpu: True -- num_gpus: 1 -- gpu: TITAN RTX -- gpu_ram_mb: 24217 -- gpu_power_watts: 280.0 -- gpu_performance_state: 2 -- use_tpu: False -``` - - -```bash -python examples/tensorflow/benchmarking/run_benchmark_tf.py --help -``` - -An instantiated benchmark object can then simply be run by calling `benchmark.run()`. 
- -```py ->>> results = benchmark.run() ->>> print(results) ->>> results = benchmark.run() ->>> print(results) -==================== INFERENCE - SPEED - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Time in s --------------------------------------------------------------------------------- -bert-base-uncased 8 8 0.005 -bert-base-uncased 8 32 0.008 -bert-base-uncased 8 128 0.022 -bert-base-uncased 8 512 0.105 --------------------------------------------------------------------------------- - -==================== INFERENCE - MEMORY - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Memory in MB --------------------------------------------------------------------------------- -bert-base-uncased 8 8 1330 -bert-base-uncased 8 32 1330 -bert-base-uncased 8 128 1330 -bert-base-uncased 8 512 1770 --------------------------------------------------------------------------------- - -==================== ENVIRONMENT INFORMATION ==================== - -- transformers_version: 2.11.0 -- framework: Tensorflow -- use_xla: False -- framework_version: 2.2.0 -- python_version: 3.6.10 -- system: Linux -- cpu: x86_64 -- architecture: 64bit -- date: 2020-06-29 -- time: 09:26:35.617317 -- fp16: False -- use_multiprocessing: True -- only_pretrain_model: False -- cpu_ram_mb: 32088 -- use_gpu: True -- num_gpus: 1 -- gpu: TITAN RTX -- gpu_ram_mb: 24217 -- gpu_power_watts: 280.0 -- gpu_performance_state: 2 -- use_tpu: False -``` - - - -By default, the _time_ and the _required memory_ for _inference_ are benchmarked. In the example output above the first -two sections show the result corresponding to _inference time_ and _inference memory_. In addition, all relevant -information about the computing environment, _e.g._ the GPU type, the system, the library versions, etc... are printed -out in the third section under _ENVIRONMENT INFORMATION_. This information can optionally be saved in a _.csv_ file -when adding the argument `save_to_csv=True` to [`PyTorchBenchmarkArguments`] and -[`TensorFlowBenchmarkArguments`] respectively. In this case, every section is saved in a separate -_.csv_ file. The path to each _.csv_ file can optionally be defined via the argument data classes. - -Instead of benchmarking pre-trained models via their model identifier, _e.g._ `bert-base-uncased`, the user can -alternatively benchmark an arbitrary configuration of any available model class. In this case, a `list` of -configurations must be inserted with the benchmark args as follows. - - - -```py ->>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments, BertConfig - ->>> args = PyTorchBenchmarkArguments( -... models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] -... 
) ->>> config_base = BertConfig() ->>> config_384_hid = BertConfig(hidden_size=384) ->>> config_6_lay = BertConfig(num_hidden_layers=6) - ->>> benchmark = PyTorchBenchmark(args, configs=[config_base, config_384_hid, config_6_lay]) ->>> benchmark.run() -==================== INFERENCE - SPEED - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Time in s --------------------------------------------------------------------------------- -bert-base 8 128 0.006 -bert-base 8 512 0.006 -bert-base 8 128 0.018 -bert-base 8 512 0.088 -bert-384-hid 8 8 0.006 -bert-384-hid 8 32 0.006 -bert-384-hid 8 128 0.011 -bert-384-hid 8 512 0.054 -bert-6-lay 8 8 0.003 -bert-6-lay 8 32 0.004 -bert-6-lay 8 128 0.009 -bert-6-lay 8 512 0.044 --------------------------------------------------------------------------------- - -==================== INFERENCE - MEMORY - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Memory in MB --------------------------------------------------------------------------------- -bert-base 8 8 1277 -bert-base 8 32 1281 -bert-base 8 128 1307 -bert-base 8 512 1539 -bert-384-hid 8 8 1005 -bert-384-hid 8 32 1027 -bert-384-hid 8 128 1035 -bert-384-hid 8 512 1255 -bert-6-lay 8 8 1097 -bert-6-lay 8 32 1101 -bert-6-lay 8 128 1127 -bert-6-lay 8 512 1359 --------------------------------------------------------------------------------- - -==================== ENVIRONMENT INFORMATION ==================== - -- transformers_version: 2.11.0 -- framework: PyTorch -- use_torchscript: False -- framework_version: 1.4.0 -- python_version: 3.6.10 -- system: Linux -- cpu: x86_64 -- architecture: 64bit -- date: 2020-06-29 -- time: 09:35:25.143267 -- fp16: False -- use_multiprocessing: True -- only_pretrain_model: False -- cpu_ram_mb: 32088 -- use_gpu: True -- num_gpus: 1 -- gpu: TITAN RTX -- gpu_ram_mb: 24217 -- gpu_power_watts: 280.0 -- gpu_performance_state: 2 -- use_tpu: False -``` - - -```py ->>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments, BertConfig - ->>> args = TensorFlowBenchmarkArguments( -... models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] -... 
) ->>> config_base = BertConfig() ->>> config_384_hid = BertConfig(hidden_size=384) ->>> config_6_lay = BertConfig(num_hidden_layers=6) - ->>> benchmark = TensorFlowBenchmark(args, configs=[config_base, config_384_hid, config_6_lay]) ->>> benchmark.run() -==================== INFERENCE - SPEED - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Time in s --------------------------------------------------------------------------------- -bert-base 8 8 0.005 -bert-base 8 32 0.008 -bert-base 8 128 0.022 -bert-base 8 512 0.106 -bert-384-hid 8 8 0.005 -bert-384-hid 8 32 0.007 -bert-384-hid 8 128 0.018 -bert-384-hid 8 512 0.064 -bert-6-lay 8 8 0.002 -bert-6-lay 8 32 0.003 -bert-6-lay 8 128 0.0011 -bert-6-lay 8 512 0.074 --------------------------------------------------------------------------------- - -==================== INFERENCE - MEMORY - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Memory in MB --------------------------------------------------------------------------------- -bert-base 8 8 1330 -bert-base 8 32 1330 -bert-base 8 128 1330 -bert-base 8 512 1770 -bert-384-hid 8 8 1330 -bert-384-hid 8 32 1330 -bert-384-hid 8 128 1330 -bert-384-hid 8 512 1540 -bert-6-lay 8 8 1330 -bert-6-lay 8 32 1330 -bert-6-lay 8 128 1330 -bert-6-lay 8 512 1540 --------------------------------------------------------------------------------- - -==================== ENVIRONMENT INFORMATION ==================== - -- transformers_version: 2.11.0 -- framework: Tensorflow -- use_xla: False -- framework_version: 2.2.0 -- python_version: 3.6.10 -- system: Linux -- cpu: x86_64 -- architecture: 64bit -- date: 2020-06-29 -- time: 09:38:15.487125 -- fp16: False -- use_multiprocessing: True -- only_pretrain_model: False -- cpu_ram_mb: 32088 -- use_gpu: True -- num_gpus: 1 -- gpu: TITAN RTX -- gpu_ram_mb: 24217 -- gpu_power_watts: 280.0 -- gpu_performance_state: 2 -- use_tpu: False -``` - - - -Again, _inference time_ and _required memory_ for _inference_ are measured, but this time for customized configurations -of the `BertModel` class. This feature can especially be helpful when deciding for which configuration the model -should be trained. - - -## Benchmark best practices - -This section lists a couple of best practices one should be aware of when benchmarking a model. - -- Currently, only single device benchmarking is supported. When benchmarking on GPU, it is recommended that the user - specifies on which device the code should be run by setting the `CUDA_VISIBLE_DEVICES` environment variable in the - shell, _e.g._ `export CUDA_VISIBLE_DEVICES=0` before running the code. -- The option `no_multi_processing` should only be set to `True` for testing and debugging. To ensure accurate - memory measurement it is recommended to run each memory benchmark in a separate process by making sure - `no_multi_processing` is set to `True`. -- One should always state the environment information when sharing the results of a model benchmark. Results can vary - heavily between different GPU devices, library versions, etc., so that benchmark results on their own are not very - useful for the community. - - -## Sharing your benchmark - -Previously all available core models (10 at the time) have been benchmarked for _inference time_, across many different -settings: using PyTorch, with and without TorchScript, using TensorFlow, with and without XLA. 
All of those tests were -done across CPUs (except for TensorFlow XLA) and GPUs. - -The approach is detailed in the [following blogpost](https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2) and the results are -available [here](https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing). - -With the new _benchmark_ tools, it is easier than ever to share your benchmark results with the community - -- [PyTorch Benchmarking Results](https://github.com/huggingface/transformers/tree/main/examples/pytorch/benchmarking/README.md). -- [TensorFlow Benchmarking Results](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/benchmarking/README.md). diff --git a/docs/source/en/bertology.md b/docs/source/en/bertology.md new file mode 100644 index 000000000000..ba1b4bd4002b --- /dev/null +++ b/docs/source/en/bertology.md @@ -0,0 +1,41 @@ + + +# BERTology + +There is a growing field of study concerned with investigating the inner workings of large-scale transformers like BERT +(that some call "BERTology"). Some good examples of this field are: + + +- BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: + https://arxiv.org/abs/1905.05950 +- Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650 +- What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. + Manning: https://arxiv.org/abs/1906.04341 +- CAT-probing: A Metric-based Approach to Interpret How Pre-trained Models for Programming Language Attend Code Structure: https://arxiv.org/abs/2210.04633 + +In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to +help people access the inner representations, mainly adapted from the great work of Paul Michel +(https://arxiv.org/abs/1905.10650): + + +- accessing all the hidden-states of BERT/GPT/GPT-2, +- accessing all the attention weights for each head of BERT/GPT/GPT-2, +- retrieving head output values and gradients to be able to compute head importance scores and prune heads as explained + in https://arxiv.org/abs/1905.10650. + +To help you understand and use these features, we have added a specific example script: [bertology.py](https://github.com/huggingface/transformers/tree/main/examples/research_projects/bertology/run_bertology.py) which extracts information and prunes a model pre-trained on +GLUE. diff --git a/docs/source/en/bertology.mdx b/docs/source/en/bertology.mdx deleted file mode 100644 index e64379d6580d..000000000000 --- a/docs/source/en/bertology.mdx +++ /dev/null @@ -1,36 +0,0 @@ - - -# BERTology - -There is a growing field of study concerned with investigating the inner working of large-scale transformers like BERT -(that some call "BERTology"). Some good examples of this field are: - - -- BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: - https://arxiv.org/abs/1905.05950 -- Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650 -- What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D.
- Manning: https://arxiv.org/abs/1906.04341 - -In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to -help people access the inner representations, mainly adapted from the great work of Paul Michel -(https://arxiv.org/abs/1905.10650): - - -- accessing all the hidden-states of BERT/GPT/GPT-2, -- accessing all the attention weights for each head of BERT/GPT/GPT-2, -- retrieving heads output values and gradients to be able to compute head importance score and prune head as explained - in https://arxiv.org/abs/1905.10650. - -To help you understand and use these features, we have added a specific example script: [bertology.py](https://github.com/huggingface/transformers/tree/main/examples/research_projects/bertology/run_bertology.py) while extract information and prune a model pre-trained on -GLUE. diff --git a/docs/source/en/big_models.md b/docs/source/en/big_models.md new file mode 100644 index 000000000000..9b57e4331760 --- /dev/null +++ b/docs/source/en/big_models.md @@ -0,0 +1,123 @@ + + +# Instantiating a big model + +When you want to use a very big pretrained model, one challenge is to minimize the use of the RAM. The usual workflow +from PyTorch is: + +1. Create your model with random weights. +2. Load your pretrained weights. +3. Put those pretrained weights in your random model. + +Step 1 and 2 both require a full version of the model in memory, which is not a problem in most cases, but if your model starts weighing several GigaBytes, those two copies can make you get out of RAM. Even worse, if you are using `torch.distributed` to launch a distributed training, each process will load the pretrained model and store these two copies in RAM. + + + +Note that the randomly created model is initialized with "empty" tensors, which take the space in memory without filling it (thus the random values are whatever was in this chunk of memory at a given time). The random initialization following the appropriate distribution for the kind of model/parameters instantiated (like a normal distribution for instance) is only performed after step 3 on the non-initialized weights, to be as fast as possible! + + + +In this guide, we explore the solutions Transformers offer to deal with this issue. Note that this is an area of active development, so the APIs explained here may change slightly in the future. + +## Sharded checkpoints + +Since version 4.18.0, model checkpoints that end up taking more than 10GB of space are automatically sharded in smaller pieces. In terms of having one single checkpoint when you do `model.save_pretrained(save_dir)`, you will end up with several partial checkpoints (each of which being of size < 10GB) and an index that maps parameter names to the files they are stored in. + +You can control the maximum size before sharding with the `max_shard_size` parameter, so for the sake of an example, we'll use a normal-size models with a small shard size: let's take a traditional BERT model. + +```py +from transformers import AutoModel + +model = AutoModel.from_pretrained("bert-base-cased") +``` + +If you save it using [`~PreTrainedModel.save_pretrained`], you will get a new folder with two files: the config of the model and its weights: + +```py +>>> import os +>>> import tempfile + +>>> with tempfile.TemporaryDirectory() as tmp_dir: +... model.save_pretrained(tmp_dir) +... 
print(sorted(os.listdir(tmp_dir))) +['config.json', 'pytorch_model.bin'] +``` + +Now let's use a maximum shard size of 200MB: + +```py +>>> with tempfile.TemporaryDirectory() as tmp_dir: +... model.save_pretrained(tmp_dir, max_shard_size="200MB") +... print(sorted(os.listdir(tmp_dir))) +['config.json', 'pytorch_model-00001-of-00003.bin', 'pytorch_model-00002-of-00003.bin', 'pytorch_model-00003-of-00003.bin', 'pytorch_model.bin.index.json'] +``` + +On top of the configuration of the model, we see three different weights files, and an `index.json` file which is our index. A checkpoint like this can be fully reloaded using the [`~PreTrainedModel.from_pretrained`] method: + +```py +>>> with tempfile.TemporaryDirectory() as tmp_dir: +... model.save_pretrained(tmp_dir, max_shard_size="200MB") +... new_model = AutoModel.from_pretrained(tmp_dir) +``` + +The main advantage of doing this for big models is that during step 2 of the workflow shown above, each shard of the checkpoint is loaded after the previous one, capping the memory usage in RAM to the model size plus the size of the biggest shard. + +Behind the scenes, the index file is used to determine which keys are in the checkpoint, and where the corresponding weights are stored. We can load that index like any json and get a dictionary: + +```py +>>> import json + +>>> with tempfile.TemporaryDirectory() as tmp_dir: +... model.save_pretrained(tmp_dir, max_shard_size="200MB") +... with open(os.path.join(tmp_dir, "pytorch_model.bin.index.json"), "r") as f: +... index = json.load(f) + +>>> print(index.keys()) +dict_keys(['metadata', 'weight_map']) +``` + +The metadata just consists of the total size of the model for now. We plan to add other information in the future: + +```py +>>> index["metadata"] +{'total_size': 433245184} +``` + +The weights map is the main part of this index, which maps each parameter name (as usually found in a PyTorch model `state_dict`) to the file it's stored in: + +```py +>>> index["weight_map"] +{'embeddings.LayerNorm.bias': 'pytorch_model-00001-of-00003.bin', + 'embeddings.LayerNorm.weight': 'pytorch_model-00001-of-00003.bin', + ... +``` + +If you want to directly load such a sharded checkpoint inside a model without using [`~PreTrainedModel.from_pretrained`] (like you would do `model.load_state_dict()` for a full checkpoint) you should use [`~modeling_utils.load_sharded_checkpoint`]: + +```py +>>> from transformers.modeling_utils import load_sharded_checkpoint + +>>> with tempfile.TemporaryDirectory() as tmp_dir: +... model.save_pretrained(tmp_dir, max_shard_size="200MB") +... load_sharded_checkpoint(model, tmp_dir) +``` + +## Low memory loading + +Sharded checkpoints reduce the memory usage during step 2 of the workflow mentioned above, but in order to use that model in a low memory setting, we recommend leveraging our tools based on the Accelerate library. + +Please read the following guide for more information: [Large model loading using Accelerate](./main_classes/model#large-model-loading) diff --git a/docs/source/en/big_models.mdx b/docs/source/en/big_models.mdx deleted file mode 100644 index 971403b62d4a..000000000000 --- a/docs/source/en/big_models.mdx +++ /dev/null @@ -1,119 +0,0 @@ - - -# Instantiating a big model - -When you want to use a very big pretrained model, one challenge is to minimize the use of the RAM. The usual workflow -from PyTorch is: - -1. Create your model with random weights. -2. Load your pretrained weights. -3. Put those pretrained weights in your random model. 
- -Step 1 and 2 both require a full version of the model in memory, which is not a problem in most cases, but if your model starts weighing several GigaBytes, those two copies can make you got our of RAM. Even worse, if you are using `torch.distributed` to launch a distributed training, each process will load the pretrained model and store these two copies in RAM. - - - -Note that the randomly created model is initialized with "empty" tensors, which take the space in memory without filling it (thus the random values are whatever was in this chunk of memory at a given time). The random initialization following the appropriate distribution for the kind of model/parameters instatiated (like a normal distribution for instance) is only performed after step 3 on the non-initialized weights, to be as fast as possible! - - - -In this guide, we explore the solutions Transformers offer to deal with this issue. Note that this is an area of active development, so the APIs explained here may change slightly in the future. - -## Sharded checkpoints - -Since version 4.18.0, model checkpoints that end up taking more than 10GB of space are automatically sharded in smaller pieces. In terms of having one single checkpoint when you do `model.save_pretrained(save_dir)`, you will end up with several partial checkpoints (each of which being of size < 10GB) and an index that maps parameter names to the files they are stored in. - -You can control the maximum size before sharding with the `max_shard_size` parameter, so for the sake of an example, we'll use a normal-size models with a small shard size: let's take a traditional BERT model. - -```py -from transformers import AutoModel - -model = AutoModel.from_pretrained("bert-base-cased") -``` - -If you save it using [`~PreTrainedModel.save_pretrained`], you will get a new folder with two files: the config of the model and its weights: - -```py ->>> import os ->>> import tempfile - ->>> with tempfile.TemporaryDirectory() as tmp_dir: -... model.save_pretrained(tmp_dir) -... print(sorted(os.listdir(tmp_dir))) -['config.json', 'pytorch_model.bin'] -``` - -Now let's use a maximum shard size of 200MB: - -```py ->>> with tempfile.TemporaryDirectory() as tmp_dir: -... model.save_pretrained(tmp_dir, max_shard_size="200MB") -... print(sorted(os.listdir(tmp_dir))) -['config.json', 'pytorch_model-00001-of-00003.bin', 'pytorch_model-00002-of-00003.bin', 'pytorch_model-00003-of-00003.bin', 'pytorch_model.bin.index.json'] -``` - -On top of the configuration of the model, we see three different weights files, and an `index.json` file which is our index. A checkpoint like this can be fully reloaded using the [`~PreTrainedModel.from_pretrained`] method: - -```py ->>> with tempfile.TemporaryDirectory() as tmp_dir: -... model.save_pretrained(tmp_dir, max_shard_size="200MB") -... new_model = AutoModel.from_pretrained(tmp_dir) -``` - -The main advantage of doing this for big models is that during step 2 of the workflow shown above, each shard of the checkpoint is loaded after the previous one, capping the memory usage in RAM to the model size plus the size of the biggest shard. - -Behind the scenes, the index file is used to determine which keys are in the checkpoint, and where the corresponding weights are stored. We can load that index like any json and get a dictionary: - -```py ->>> import json - ->>> with tempfile.TemporaryDirectory() as tmp_dir: -... model.save_pretrained(tmp_dir, max_shard_size="200MB") -... 
with open(os.path.join(tmp_dir, "pytorch_model.bin.index.json"), "r") as f: -... index = json.load(f) - ->>> print(index.keys()) -dict_keys(['metadata', 'weight_map']) -``` - -The metadata just consists of the total size of the model for now. We plan to add other information in the future: - -```py ->>> index["metadata"] -{'total_size': 433245184} -``` - -The weights map is the main part of this index, which maps each parameter name (as usually found in a PyTorch model `state_dict`) to the file it's stored in: - -```py ->>> index["weight_map"] -{'embeddings.LayerNorm.bias': 'pytorch_model-00001-of-00003.bin', - 'embeddings.LayerNorm.weight': 'pytorch_model-00001-of-00003.bin', - ... -``` - -If you want to directly load such a sharded checkpoint inside a model without using [`~PreTrainedModel.from_pretrained`] (like you would do `model.load_state_dict()` for a full checkpoint) you should use [`~modeling_utils.load_sharded_checkpoint`]: - -```py ->>> from transformers.modeling_utils import load_sharded_checkpoint - ->>> with tempfile.TemporaryDirectory() as tmp_dir: -... model.save_pretrained(tmp_dir, max_shard_size="200MB") -... load_sharded_checkpoint(model, tmp_dir) -``` - -## Low memory loading - -Sharded checkpoints reduce the memory usage during step 2 of the workflow mentioned above, but in order to use that model in a low memory setting, we recommend leveraging our tools based on the Accelerate library. - -Please read the following guide for more information: [Large model loading using Accelerate](./main_classes/model#large-model-loading) \ No newline at end of file diff --git a/docs/source/en/chat_templating.md b/docs/source/en/chat_templating.md new file mode 100644 index 000000000000..115c9f51677d --- /dev/null +++ b/docs/source/en/chat_templating.md @@ -0,0 +1,367 @@ + + +# Templates for Chat Models + +## Introduction + +An increasingly common use case for LLMs is **chat**. In a chat context, rather than continuing a single string +of text (as is the case with a standard language model), the model instead continues a conversation that consists +of one or more **messages**, each of which includes a **role** as well as message text. + +Most commonly, these roles are "user" for messages sent by the user, and "assistant" for messages sent by the model. +Some models also support a "system" role. System messages are usually sent at the beginning of the conversation +and include directives about how the model should behave in the subsequent chat. + +All language models, including models fine-tuned for chat, operate on linear sequences of tokens and do not intrinsically +have special handling for roles. This means that role information is usually injected by adding control tokens +between messages, to indicate both the message boundary and the relevant roles. + +Unfortunately, there isn't (yet!) a standard for which tokens to use, and so different models have been trained +with wildly different formatting and control tokens for chat. This can be a real problem for users - if you use the +wrong format, then the model will be confused by your input, and your performance will be a lot worse than it should be. +This is the problem that **chat templates** aim to resolve. + +Chat conversations are typically represented as a list of dictionaries, where each dictionary contains `role` +and `content` keys, and represents a single chat message. Chat templates are strings containing a Jinja template that +specifies how to format a conversation for a given model into a single tokenizable sequence. 
By storing this information +with the tokenizer, we can ensure that models get input data in the format they expect. + +Let's make this concrete with a quick example using the `BlenderBot` model. BlenderBot has an extremely simple default +template, which mostly just adds whitespace between rounds of dialogue: + +```python +>>> from transformers import AutoTokenizer +>>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") + +>>> chat = [ +... {"role": "user", "content": "Hello, how are you?"}, +... {"role": "assistant", "content": "I'm doing great. How can I help you today?"}, +... {"role": "user", "content": "I'd like to show off how chat templating works!"}, +... ] + +>>> tokenizer.apply_chat_template(chat, tokenize=False) +" Hello, how are you? I'm doing great. How can I help you today? I'd like to show off how chat templating works!" +``` + +Notice how the entire chat is condensed into a single string. If we use `tokenize=True`, which is the default setting, +that string will also be tokenized for us. To see a more complex template in action, though, let's use the +`meta-llama/Llama-2-7b-chat-hf` model. Note that this model has gated access, so you will have to +[request access on the repo](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) if you want to run this code yourself: + +```python +>> from transformers import AutoTokenizer +>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") + +>> chat = [ +... {"role": "user", "content": "Hello, how are you?"}, +... {"role": "assistant", "content": "I'm doing great. How can I help you today?"}, +... {"role": "user", "content": "I'd like to show off how chat templating works!"}, +... ] + +>> tokenizer.use_default_system_prompt = False +>> tokenizer.apply_chat_template(chat, tokenize=False) +"[INST] Hello, how are you? [/INST] I'm doing great. How can I help you today? [INST] I'd like to show off how chat templating works! [/INST]" +``` + +Note that this time, the tokenizer has added the control tokens [INST] and [/INST] to indicate the start and end of +user messages (but not assistant messages!) + +## How do chat templates work? + +The chat template for a model is stored on the `tokenizer.chat_template` attribute. If no chat template is set, the +default template for that model class is used instead. Let's take a look at the template for `BlenderBot`: + +```python + +>>> from transformers import AutoTokenizer +>>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") + +>>> tokenizer.default_chat_template +"{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}" +``` + +That's kind of intimidating. Let's add some newlines and indentation to make it more readable. Note that the first +newline after each block as well as any preceding whitespace before a block are ignored by default, using the +Jinja `trim_blocks` and `lstrip_blocks` flags. However, be cautious - although leading whitespace on each +line is stripped, spaces between blocks on the same line are not. We strongly recommend checking that your template +isn't printing extra spaces where it shouldn't be! 
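+
+One quick way to perform that check is to render a short conversation and inspect the `repr()` of the result, which makes stray spaces and newlines easy to spot. Here is a minimal sketch reusing the BlenderBot example from above:
+
+```python
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
+
+chat = [
+    {"role": "user", "content": "Hello, how are you?"},
+    {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
+    {"role": "user", "content": "I'd like to show off how chat templating works!"},
+]
+
+# repr() shows the exact string, so stray spaces and newlines are easier to spot
+print(repr(tokenizer.apply_chat_template(chat, tokenize=False)))
+```
+
+With that check in mind, here is the template again with the newlines and indentation added: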
+ +``` +{% for message in messages %} + {% if message['role'] == 'user' %} + {{ ' ' }} + {% endif %} + {{ message['content'] }} + {% if not loop.last %} + {{ ' ' }} + {% endif %} +{% endfor %} +{{ eos_token }} +``` + +If you've never seen one of these before, this is a [Jinja template](https://jinja.palletsprojects.com/en/3.1.x/templates/). +Jinja is a templating language that allows you to write simple code that generates text. In many ways, the code and +syntax resembles Python. In pure Python, this template would look something like this: + +```python +for idx, message in enumerate(messages): + if message['role'] == 'user': + print(' ') + print(message['content']) + if not idx == len(messages) - 1: # Check for the last message in the conversation + print(' ') +print(eos_token) +``` + +Effectively, the template does three things: +1. For each message, if the message is a user message, add a blank space before it, otherwise print nothing. +2. Add the message content +3. If the message is not the last message, add two spaces after it. After the final message, print the EOS token. + +This is a pretty simple template - it doesn't add any control tokens, and it doesn't support "system" messages, which +are a common way to give the model directives about how it should behave in the subsequent conversation. +But Jinja gives you a lot of flexibility to do those things! Let's see a Jinja template that can format inputs +similarly to the way LLaMA formats them (note that the real LLaMA template includes handling for default system +messages and slightly different system message handling in general - don't use this one in your actual code!) + +``` +{% for message in messages %} + {% if message['role'] == 'user' %} + {{ bos_token + '[INST] ' + message['content'] + ' [/INST]' }} + {% elif message['role'] == 'system' %} + {{ '<>\\n' + message['content'] + '\\n<>\\n\\n' }} + {% elif message['role'] == 'assistant' %} + {{ ' ' + message['content'] + ' ' + eos_token }} + {% endif %} +{% endfor %} +``` + +Hopefully if you stare at this for a little bit you can see what this template is doing - it adds specific tokens based +on the "role" of each message, which represents who sent it. User, assistant and system messages are clearly +distinguishable to the model because of the tokens they're wrapped in. + +## How do I create a chat template? + +Simple, just write a jinja template and set `tokenizer.chat_template`. You may find it easier to start with an +existing template from another model and simply edit it for your needs! For example, we could take the LLaMA template +above and add "[ASST]" and "[/ASST]" to assistant messages: + +``` +{% for message in messages %} + {% if message['role'] == 'user' %} + {{ bos_token + '[INST] ' + message['content'].strip() + ' [/INST]' }} + {% elif message['role'] == 'system' %} + {{ '<>\\n' + message['content'].strip() + '\\n<>\\n\\n' }} + {% elif message['role'] == 'assistant' %} + {{ '[ASST] ' + message['content'] + ' [/ASST]' + eos_token }} + {% endif %} +{% endfor %} +``` + +Now, simply set the `tokenizer.chat_template` attribute. Next time you use [`~PreTrainedTokenizer.apply_chat_template`], it will +use your new template! This attribute will be saved in the `tokenizer_config.json` file, so you can use +[`~utils.PushToHubMixin.push_to_hub`] to upload your new template to the Hub and make sure everyone's using the right +template for your model! 
+ +```python +template = tokenizer.chat_template +template = template.replace("SYS", "SYSTEM") # Change the system token +tokenizer.chat_template = template # Set the new template +tokenizer.push_to_hub("model_name") # Upload your new template to the Hub! +``` + +The method [`~PreTrainedTokenizer.apply_chat_template`] which uses your chat template is called by the [`ConversationalPipeline`] class, so +once you set the correct chat template, your model will automatically become compatible with [`ConversationalPipeline`]. + +## What are "default" templates? + +Before the introduction of chat templates, chat handling was hardcoded at the model class level. For backwards +compatibility, we have retained this class-specific handling as default templates, also set at the class level. If a +model does not have a chat template set, but there is a default template for its model class, the `ConversationalPipeline` +class and methods like `apply_chat_template` will use the class template instead. You can find out what the default +template for your tokenizer is by checking the `tokenizer.default_chat_template` attribute. + +This is something we do purely for backward compatibility reasons, to avoid breaking any existing workflows. Even when +the class template is appropriate for your model, we strongly recommend overriding the default template by +setting the `chat_template` attribute explicitly to make it clear to users that your model has been correctly configured +for chat, and to future-proof in case the default templates are ever altered or deprecated. + +## What template should I use? + +When setting the template for a model that's already been trained for chat, you should ensure that the template +exactly matches the message formatting that the model saw during training, or else you will probably experience +performance degradation. This is true even if you're training the model further - you will probably get the best +performance if you keep the chat tokens constant. This is very analogous to tokenization - you generally get the +best performance for inference or fine-tuning when you precisely match the tokenization used during training. + +If you're training a model from scratch, or fine-tuning a base language model for chat, on the other hand, +you have a lot of freedom to choose an appropriate template! LLMs are smart enough to learn to handle lots of different +input formats. Our default template for models that don't have a class-specific template follows the +[ChatML format](https://github.com/openai/openai-python/blob/main/chatml.md), and this is a good, flexible choice for many use-cases. It looks like this: + +``` +{% for message in messages %} + {{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}} +{% endfor %} +``` + +If you like this one, here it is in one-liner form, ready to copy into your code. The one-liner also includes +handy support for "generation prompts" - see the next section for more! + +``` +tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" +``` + +This template wraps each message in `<|im_start|>` and `<|im_end|>` tokens, and simply writes the role as a string, which +allows for flexibility in the roles you train with. 
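+
+To see this template in action, here is a minimal sketch (the `gpt2` checkpoint is only a placeholder for whichever tokenizer you are configuring) that sets the ChatML one-liner above as the chat template and renders a short conversation with [`~PreTrainedTokenizer.apply_chat_template`]:
+
+```python
+from transformers import AutoTokenizer
+
+# Placeholder checkpoint - any tokenizer works once its chat template is set
+tokenizer = AutoTokenizer.from_pretrained("gpt2")
+tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+
+messages = [
+    {"role": "system", "content": "You are a helpful chatbot that will do its best not to say anything so stupid that people tweet about it."},
+    {"role": "user", "content": "How are you?"},
+    {"role": "assistant", "content": "I'm doing great!"},
+]
+
+# tokenize=False returns the formatted string so it can be inspected directly
+print(tokenizer.apply_chat_template(messages, tokenize=False))
+```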
The output looks like this: + +``` +<|im_start|>system +You are a helpful chatbot that will do its best not to say anything so stupid that people tweet about it.<|im_end|> +<|im_start|>user +How are you?<|im_end|> +<|im_start|>assistant +I'm doing great!<|im_end|> +``` + +The "user", "system" and "assistant" roles are the standard for chat, and we recommend using them when it makes sense, +particularly if you want your model to operate well with [`ConversationalPipeline`]. However, you are not limited +to these roles - templating is extremely flexible, and any string can be a role. + +## What are "generation prompts"? + +You may notice that the `apply_chat_template` method has an `add_generation_prompt` argument. This argument tells +the template to add tokens that indicate the start of a bot response. For example, consider the following chat: + +```python +messages = [ + {"role": "user", "content": "Hi there!"}, + {"role": "assistant", "content": "Nice to meet you!"}, + {"role": "user", "content": "Can I ask a question?"} +] +``` + +Here's what this will look like without a generation prompt, using the ChatML template we described above: + +```python +>> tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False) +"""<|im_start|>user +Hi there!<|im_end|> +<|im_start|>assistant +Nice to meet you!<|im_end|> +<|im_start|>user +Can I ask a question?<|im_end|> +""" +``` + +And here's what it looks like **with** a generation prompt: + +```python +>> tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) +"""<|im_start|>user +Hi there!<|im_end|> +<|im_start|>assistant +Nice to meet you!<|im_end|> +<|im_start|>user +Can I ask a question?<|im_end|> +<|im_start|>assistant +""" +``` + +Note that this time, we've added the tokens that indicate the start of a bot response. This ensures that when the model +generates text it will write a bot response instead of doing something unexpected, like continuing the user's +message. Remember, chat models are still just language models - they're trained to continue text, and chat is just a +special kind of text to them! You need to guide them with the appropriate control tokens so they know what they're +supposed to be doing. + +Not all models require generation prompts. Some models, like BlenderBot and LLaMA, don't have any +special tokens before bot responses. In these cases, the `add_generation_prompt` argument will have no effect. The exact +effect that `add_generation_prompt` has will depend on the template being used. + +## I want to use chat templates! How should I get started? + +If you have any chat models, you should set their `tokenizer.chat_template` attribute and test it using +[`~PreTrainedTokenizer.apply_chat_template`]. This applies even if you're not the model owner - if you're using a model +with an empty chat template, or one that's still using the default class template, please open a [pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions) to +the model repository so that this attribute can be set properly! + +Once the attribute is set, that's it, you're done! `tokenizer.apply_chat_template` will now work correctly for that +model, which means it is also automatically supported in places like `ConversationalPipeline`! + +By ensuring that models have this attribute, we can make sure that the whole community gets to use the full power of +open-source models. 
Formatting mismatches have been haunting the field and silently harming performance for too long - +it's time to put an end to them! + +## Template writing tips + +If you're unfamiliar with Jinja, we generally find that the easiest way to write a chat template is to first +write a short Python script that formats messages the way you want, and then convert that script into a template. + +Remember that the template handler will receive the conversation history as a variable called `messages`. Each +message is a dictionary with two keys, `role` and `content`. You will be able to access `messages` in your template +just like you can in Python, which means you can loop over it with `{% for message in messages %}` or access +individual messages with, for example, `{{ messages[0] }}`. + +You can also use the following tips to convert your code to Jinja: + +### For loops + +For loops in Jinja look like this: + +``` +{% for message in messages %} +{{ message['content'] }} +{% endfor %} +``` + +Note that whatever's inside the {{ expression block }} will be printed to the output. You can use operators like +`+` to combine strings inside expression blocks. + +### If statements + +If statements in Jinja look like this: + +``` +{% if message['role'] == 'user' %} +{{ message['content'] }} +{% endif %} +``` + +Note how where Python uses whitespace to mark the beginnings and ends of `for` and `if` blocks, Jinja requires you +to explicitly end them with `{% endfor %}` and `{% endif %}`. + +### Special variables + +Inside your template, you will have access to the list of `messages`, but you can also access several other special +variables. These include special tokens like `bos_token` and `eos_token`, as well as the `add_generation_prompt` +variable that we discussed above. You can also use the `loop` variable to access information about the current loop +iteration, for example using `{% if loop.last %}` to check if the current message is the last message in the +conversation. Here's an example that puts these ideas together to add a generation prompt at the end of the +conversation if add_generation_prompt is `True`: + +``` +{% if loop.last and add_generation_prompt %} +{{ bos_token + 'Assistant:\n' }} +{% endif %} +``` + +### Notes on whitespace + +As much as possible, we've tried to get Jinja to ignore whitespace outside of {{ expressions }}. However, be aware +that Jinja is a general-purpose templating engine, and it may treat whitespace between blocks on the same line +as significant and print it to the output. We **strongly** recommend checking that your template isn't printing extra +spaces where it shouldn't be before you upload it! \ No newline at end of file diff --git a/docs/source/en/community.md b/docs/source/en/community.md new file mode 100644 index 000000000000..0305844a1be8 --- /dev/null +++ b/docs/source/en/community.md @@ -0,0 +1,69 @@ + + +# Community + +This page regroups resources around 🤗 Transformers developed by the community. + +## Community resources: + +| Resource | Description | Author | +|:----------|:-------------|------:| +| [Hugging Face Transformers Glossary Flashcards](https://www.darigovresearch.com/huggingface-transformers-glossary-flashcards) | A set of flashcards based on the [Transformers Docs Glossary](glossary) that has been put into a form which can be easily learned/revised using [Anki ](https://apps.ankiweb.net/) an open source, cross platform app specifically designed for long term knowledge retention. 
See this [Introductory video on how to use the flashcards](https://www.youtube.com/watch?v=Dji_h7PILrw). | [Darigov Research](https://www.darigovresearch.com/) | + +## Community notebooks: + +| Notebook | Description | Author | | +|:----------|:-------------|:-------------|------:| +| [Fine-tune a pre-trained Transformer to generate lyrics](https://github.com/AlekseyKorshuk/huggingartists) | How to generate lyrics in the style of your favorite artist by fine-tuning a GPT-2 model | [Aleksey Korshuk](https://github.com/AlekseyKorshuk) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb) | +| [Train T5 in Tensorflow 2 ](https://github.com/snapthat/TF-T5-text-to-text) | How to train T5 for any task using Tensorflow 2. This notebook demonstrates a Question & Answer task implemented in Tensorflow 2 using SQUAD | [Muhammad Harris](https://github.com/HarrisDePerceptron) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb) | +| [Train T5 on TPU](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb) | How to train T5 on SQUAD with Transformers and Nlp | [Suraj Patil](https://github.com/patil-suraj) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) | +| [Fine-tune T5 for Classification and Multiple Choice](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | How to fine-tune T5 for classification and multiple choice tasks using a text-to-text format with PyTorch Lightning | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | +| [Fine-tune DialoGPT on New Datasets and Languages](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | How to fine-tune the DialoGPT model on a new dataset for open-dialog conversational chatbots | [Nathan Cooper](https://github.com/ncoop57) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | +| [Long Sequence Modeling with Reformer](https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | How to train on sequences as long as 500,000 tokens with Reformer | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | +| [Fine-tune BART for Summarization](https://github.com/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb) | How to fine-tune BART for summarization with fastai using blurr | [Wayde Gilliam](https://ohmeow.com/) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb) | +| [Fine-tune a pre-trained Transformer on 
anyone's tweets](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | How to generate tweets in the style of your favorite Twitter account by fine-tuning a GPT-2 model | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | +| [Optimize 🤗 Hugging Face models with Weights & Biases](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | A complete tutorial showcasing W&B integration with Hugging Face | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | +| [Pretrain Longformer](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | How to build a "long" version of existing pretrained models | [Iz Beltagy](https://beltagy.net) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | +| [Fine-tune Longformer for QA](https://github.com/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | How to fine-tune longformer model for QA task | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | +| [Evaluate Model with 🤗nlp](https://github.com/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb) | How to evaluate longformer on TriviaQA with `nlp` | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1m7eTGlPmLRgoPkkA7rkhQdZ9ydpmsdLE?usp=sharing) | +| [Fine-tune T5 for Sentiment Span Extraction](https://github.com/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | How to fine-tune T5 for sentiment span extraction using a text-to-text format with PyTorch Lightning | [Lorenzo Ampil](https://github.com/enzoampil) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | +| [Fine-tune DistilBert for Multiclass Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb) | How to fine-tune DistilBert for multiclass classification with PyTorch | [Abhishek Kumar Mishra](https://github.com/abhimishra91) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb)| +|[Fine-tune BERT for Multi-label Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)|How to fine-tune BERT for multi-label classification using PyTorch|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)| +|[Fine-tune T5 for Summarization](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)|How to fine-tune T5 for summarization in PyTorch and track experiments with WandB|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)| +|[Speed up Fine-Tuning in Transformers with Dynamic Padding / Bucketing](https://github.com/ELS-RD/transformers-notebook/blob/master/Divide_Hugging_Face_Transformers_training_time_by_2_or_more.ipynb)|How to speed up fine-tuning by a factor of 2 using dynamic padding / bucketing|[Michael Benesty](https://github.com/pommedeterresautee) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CBfRU1zbfu7-ijiOqAAQUA-RJaxfcJoO?usp=sharing)| +|[Pretrain Reformer for Masked Language Modeling](https://github.com/patrickvonplaten/notebooks/blob/master/Reformer_For_Masked_LM.ipynb)| How to train a Reformer model with bi-directional self-attention layers | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tzzh0i8PgDQGV3SMFUGxM7_gGae3K-uW?usp=sharing)| +|[Expand and Fine Tune Sci-BERT](https://github.com/lordtt13/word-embeddings/blob/master/COVID-19%20Research%20Data/COVID-SciBERT.ipynb)| How to increase vocabulary of a pretrained SciBERT model from AllenAI on the CORD dataset and pipeline it. | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rqAR40goxbAfez1xvF3hBJphSCsvXmh8)| +|[Fine Tune BlenderBotSmall for Summarization using the Trainer API](https://github.com/lordtt13/transformers-experiments/blob/master/Custom%20Tasks/fine-tune-blenderbot_small-for-summarization.ipynb)| How to fine-tune BlenderBotSmall for summarization on a custom dataset, using the Trainer API. 
| [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/19Wmupuls7mykSGyRN_Qo6lPQhgp56ymq?usp=sharing)| +|[Fine-tune Electra and interpret with Integrated Gradients](https://github.com/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb) | How to fine-tune Electra for sentiment analysis and interpret predictions with Captum Integrated Gradients | [Eliza Szczechla](https://elsanns.github.io) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb)| +|[fine-tune a non-English GPT-2 Model with Trainer class](https://github.com/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb) | How to fine-tune a non-English GPT-2 Model with Trainer class | [Philipp Schmid](https://www.philschmid.de) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb)| +|[Fine-tune a DistilBERT Model for Multi Label Classification task](https://github.com/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb) | How to fine-tune a DistilBERT Model for Multi Label Classification task | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)| +|[Fine-tune ALBERT for sentence-pair classification](https://github.com/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb) | How to fine-tune an ALBERT model or another BERT-based model for the sentence-pair classification task | [Nadir El Manouzi](https://github.com/NadirEM) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb)| +|[Fine-tune Roberta for sentiment analysis](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | How to fine-tune a Roberta model for sentiment analysis | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)| +|[Evaluating Question Generation Models](https://github.com/flexudy-pipe/qugeev) | How accurate are the answers to questions generated by your seq2seq transformer model? 
| [Pascal Zoleko](https://github.com/zolekode) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)| +|[Classify text with DistilBERT and Tensorflow](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | How to fine-tune DistilBERT for text classification in TensorFlow | [Peter Bayerle](https://github.com/peterbayerle) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)| +|[Leverage BERT for Encoder-Decoder Summarization on CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | How to warm-start a *EncoderDecoderModel* with a *bert-base-uncased* checkpoint for summarization on CNN/Dailymail | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)| +|[Leverage RoBERTa for Encoder-Decoder Summarization on BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | How to warm-start a shared *EncoderDecoderModel* with a *roberta-base* checkpoint for summarization on BBC/XSum | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)| +|[Fine-tune TAPAS on Sequential Question Answering (SQA)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | How to fine-tune *TapasForQuestionAnswering* with a *tapas-base* checkpoint on the Sequential Question Answering (SQA) dataset | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)| +|[Evaluate TAPAS on Table Fact Checking (TabFact)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | How to evaluate a fine-tuned *TapasForSequenceClassification* with a *tapas-base-finetuned-tabfact* checkpoint using a combination of the 🤗 datasets and 🤗 transformers libraries | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)| +|[Fine-tuning mBART for translation](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | How to fine-tune mBART using Seq2SeqTrainer for Hindi to English translation | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)| +|[Fine-tune LayoutLM on FUNSD (a form understanding 
dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb) | How to fine-tune *LayoutLMForTokenClassification* on the FUNSD dataset for information extraction from scanned documents | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb)| +|[Fine-Tune DistilGPT2 and Generate Text](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb) | How to fine-tune DistilGPT2 and generate text | [Aakash Tripathi](https://github.com/tripathiaakash) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb)| +|[Fine-Tune LED on up to 8K tokens](https://github.com/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb) | How to fine-tune LED on pubmed for long-range summarization | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb)| +|[Evaluate LED on Arxiv](https://github.com/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb) | How to effectively evaluate LED on long-range summarization | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb)| +|[Fine-tune LayoutLM on RVL-CDIP (a document image classification dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb) | How to fine-tune *LayoutLMForSequenceClassification* on the RVL-CDIP dataset for scanned document classification | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb)| +|[Wav2Vec2 CTC decoding with GPT2 adjustment](https://github.com/voidful/huggingface_notebook/blob/main/xlsr_gpt.ipynb) | How to decode CTC sequence with language model adjustment | [Eric Lam](https://github.com/voidful) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1e_z5jQHYbO2YKEaUgzb1ww1WwiAyydAj?usp=sharing)| +|[Fine-tune BART for summarization in two languages with Trainer class](https://github.com/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb) | How to fine-tune BART for summarization in two languages with Trainer class | [Eliza Szczechla](https://github.com/elsanns) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)| +|[Evaluate Big Bird on Trivia 
QA](https://github.com/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb) | How to evaluate BigBird on long document question answering on Trivia QA | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb)| +| [Create video captions using Wav2Vec2](https://github.com/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | How to create YouTube captions from any video by transcribing the audio with Wav2Vec | [Niklas Muennighoff](https://github.com/Muennighoff) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | +| [Fine-tune the Vision Transformer on CIFAR-10 using PyTorch Lightning](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | How to fine-tune the Vision Transformer (ViT) on CIFAR-10 using HuggingFace Transformers, Datasets and PyTorch Lightning | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | +| [Fine-tune the Vision Transformer on CIFAR-10 using the 🤗 Trainer](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | How to fine-tune the Vision Transformer (ViT) on CIFAR-10 using HuggingFace Transformers, Datasets and the 🤗 Trainer | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | +| [Evaluate LUKE on Open Entity, an entity typing dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | How to evaluate *LukeForEntityClassification* on the Open Entity dataset | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | +| [Evaluate LUKE on TACRED, a relation extraction dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | How to evaluate *LukeForEntityPairClassification* on the TACRED dataset | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | +| [Evaluate LUKE on CoNLL-2003, an important NER benchmark](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | How to evaluate *LukeForEntitySpanClassification* on the CoNLL-2003 dataset | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | +| [Evaluate BigBird-Pegasus on PubMed dataset](https://github.com/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | How to evaluate *BigBirdPegasusForConditionalGeneration* on the PubMed dataset | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | +| [Speech Emotion Classification with Wav2Vec2](https://github.com/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | How to leverage a pretrained Wav2Vec2 model for Emotion Classification on the MEGA dataset | [Mehrdad Farahani](https://github.com/m3hrdadfi) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | +| [Detect objects in an image with DETR](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | How to use a trained *DetrForObjectDetection* model to detect objects in an image and visualize attention | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | +| [Fine-tune DETR on a custom object detection dataset](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | How to fine-tune *DetrForObjectDetection* on a custom object detection dataset | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | +| [Finetune T5 for Named Entity Recognition](https://github.com/ToluClassics/Notebooks/blob/main/T5_Ner_Finetuning.ipynb) | How to fine-tune *T5* on a Named Entity Recognition Task | [Ogundepo Odunayo](https://github.com/ToluClassics) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing) | diff --git a/docs/source/en/community.mdx b/docs/source/en/community.mdx deleted file mode 100644 index 808b16779dd9..000000000000 --- a/docs/source/en/community.mdx +++ /dev/null @@ -1,65 +0,0 @@ -# Community - -This page regroups resources around 🤗 Transformers developed by the community. - -## Community resources: - -| Resource | Description | Author | -|:----------|:-------------|------:| -| [Hugging Face Transformers Glossary Flashcards](https://www.darigovresearch.com/huggingface-transformers-glossary-flashcards) | A set of flashcards based on the [Transformers Docs Glossary](glossary) that has been put into a form which can be easily learnt/revised using [Anki ](https://apps.ankiweb.net/) an open source, cross platform app specifically designed for long term knowledge retention.
See this [Introductory video on how to use the flashcards](https://www.youtube.com/watch?v=Dji_h7PILrw). | [Darigov Research](https://www.darigovresearch.com/) | - -## Community notebooks: - -| Notebook | Description | Author | | -|:----------|:-------------|:-------------|------:| -| [Fine-tune a pre-trained Transformer to generate lyrics](https://github.com/AlekseyKorshuk/huggingartists) | How to generate lyrics in the style of your favorite artist by fine-tuning a GPT-2 model | [Aleksey Korshuk](https://github.com/AlekseyKorshuk) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb) | -| [Train T5 in Tensorflow 2 ](https://github.com/snapthat/TF-T5-text-to-text) | How to train T5 for any task using Tensorflow 2. This notebook demonstrates a Question & Answer task implemented in Tensorflow 2 using SQUAD | [Muhammad Harris](https://github.com/HarrisDePerceptron) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb) | -| [Train T5 on TPU](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb) | How to train T5 on SQUAD with Transformers and Nlp | [Suraj Patil](https://github.com/patil-suraj) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) | -| [Fine-tune T5 for Classification and Multiple Choice](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | How to fine-tune T5 for classification and multiple choice tasks using a text-to-text format with PyTorch Lightning | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | -| [Fine-tune DialoGPT on New Datasets and Languages](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | How to fine-tune the DialoGPT model on a new dataset for open-dialog conversational chatbots | [Nathan Cooper](https://github.com/ncoop57) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | -| [Long Sequence Modeling with Reformer](https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | How to train on sequences as long as 500,000 tokens with Reformer | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | -| [Fine-tune BART for Summarization](https://github.com/ohmeow/ohmeow_website/blob/master/_notebooks/2020-05-23-text-generation-with-blurr.ipynb) | How to fine-tune BART for summarization with fastai using blurr | [Wayde Gilliam](https://ohmeow.com/) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/_notebooks/2020-05-23-text-generation-with-blurr.ipynb) | -| [Fine-tune a pre-trained Transformer on anyone's 
tweets](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | How to generate tweets in the style of your favorite Twitter account by fine-tuning a GPT-2 model | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | -| [Optimize 🤗 Hugging Face models with Weights & Biases](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | A complete tutorial showcasing W&B integration with Hugging Face | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | -| [Pretrain Longformer](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | How to build a "long" version of existing pretrained models | [Iz Beltagy](https://beltagy.net) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | -| [Fine-tune Longformer for QA](https://github.com/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | How to fine-tune longformer model for QA task | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | -| [Evaluate Model with 🤗nlp](https://github.com/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb) | How to evaluate longformer on TriviaQA with `nlp` | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1m7eTGlPmLRgoPkkA7rkhQdZ9ydpmsdLE?usp=sharing) | -| [Fine-tune T5 for Sentiment Span Extraction](https://github.com/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | How to fine-tune T5 for sentiment span extraction using a text-to-text format with PyTorch Lightning | [Lorenzo Ampil](https://github.com/enzoampil) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | -| [Fine-tune DistilBert for Multiclass Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb) | How to fine-tune DistilBert for multiclass classification with PyTorch | [Abhishek Kumar Mishra](https://github.com/abhimishra91) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb)| -|[Fine-tune BERT for Multi-label Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)|How to fine-tune BERT for multi-label classification using PyTorch|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)| -|[Fine-tune T5 for Summarization](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)|How to fine-tune T5 for summarization in PyTorch and track experiments with WandB|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)| -|[Speed up Fine-Tuning in Transformers with Dynamic Padding / Bucketing](https://github.com/ELS-RD/transformers-notebook/blob/master/Divide_Hugging_Face_Transformers_training_time_by_2_or_more.ipynb)|How to speed up fine-tuning by a factor of 2 using dynamic padding / bucketing|[Michael Benesty](https://github.com/pommedeterresautee) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CBfRU1zbfu7-ijiOqAAQUA-RJaxfcJoO?usp=sharing)| -|[Pretrain Reformer for Masked Language Modeling](https://github.com/patrickvonplaten/notebooks/blob/master/Reformer_For_Masked_LM.ipynb)| How to train a Reformer model with bi-directional self-attention layers | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tzzh0i8PgDQGV3SMFUGxM7_gGae3K-uW?usp=sharing)| -|[Expand and Fine Tune Sci-BERT](https://github.com/lordtt13/word-embeddings/blob/master/COVID-19%20Research%20Data/COVID-SciBERT.ipynb)| How to increase vocabulary of a pretrained SciBERT model from AllenAI on the CORD dataset and pipeline it. | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rqAR40goxbAfez1xvF3hBJphSCsvXmh8)| -|[Fine Tune BlenderBotSmall for Summarization using the Trainer API](https://github.com/lordtt13/transformers-experiments/blob/master/Custom%20Tasks/fine-tune-blenderbot_small-for-summarization.ipynb)| How to fine tune BlenderBotSmall for summarization on a custom dataset, using the Trainer API. 
| [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/19Wmupuls7mykSGyRN_Qo6lPQhgp56ymq?usp=sharing)| -|[Fine-tune Electra and interpret with Integrated Gradients](https://github.com/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb) | How to fine-tune Electra for sentiment analysis and interpret predictions with Captum Integrated Gradients | [Eliza Szczechla](https://elsanns.github.io) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb)| -|[fine-tune a non-English GPT-2 Model with Trainer class](https://github.com/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb) | How to fine-tune a non-English GPT-2 Model with Trainer class | [Philipp Schmid](https://www.philschmid.de) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb)| -|[Fine-tune a DistilBERT Model for Multi Label Classification task](https://github.com/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb) | How to fine-tune a DistilBERT Model for Multi Label Classification task | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)| -|[Fine-tune ALBERT for sentence-pair classification](https://github.com/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb) | How to fine-tune an ALBERT model or another BERT-based model for the sentence-pair classification task | [Nadir El Manouzi](https://github.com/NadirEM) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb)| -|[Fine-tune Roberta for sentiment analysis](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | How to fine-tune a Roberta model for sentiment analysis | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)| -|[Evaluating Question Generation Models](https://github.com/flexudy-pipe/qugeev) | How accurate are the answers to questions generated by your seq2seq transformer model? 
| [Pascal Zoleko](https://github.com/zolekode) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)| -|[Classify text with DistilBERT and Tensorflow](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | How to fine-tune DistilBERT for text classification in TensorFlow | [Peter Bayerle](https://github.com/peterbayerle) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)| -|[Leverage BERT for Encoder-Decoder Summarization on CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | How to warm-start a *EncoderDecoderModel* with a *bert-base-uncased* checkpoint for summarization on CNN/Dailymail | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)| -|[Leverage RoBERTa for Encoder-Decoder Summarization on BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | How to warm-start a shared *EncoderDecoderModel* with a *roberta-base* checkpoint for summarization on BBC/XSum | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)| -|[Fine-tune TAPAS on Sequential Question Answering (SQA)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | How to fine-tune *TapasForQuestionAnswering* with a *tapas-base* checkpoint on the Sequential Question Answering (SQA) dataset | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)| -|[Evaluate TAPAS on Table Fact Checking (TabFact)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | How to evaluate a fine-tuned *TapasForSequenceClassification* with a *tapas-base-finetuned-tabfact* checkpoint using a combination of the 🤗 datasets and 🤗 transformers libraries | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)| -|[Fine-tuning mBART for translation](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | How to fine-tune mBART using Seq2SeqTrainer for Hindi to English translation | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)| -|[Fine-tune LayoutLM on FUNSD (a form understanding 
dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb) | How to fine-tune *LayoutLMForTokenClassification* on the FUNSD dataset for information extraction from scanned documents | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb)| -|[Fine-Tune DistilGPT2 and Generate Text](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb) | How to fine-tune DistilGPT2 and generate text | [Aakash Tripathi](https://github.com/tripathiaakash) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb)| -|[Fine-Tune LED on up to 8K tokens](https://github.com/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb) | How to fine-tune LED on pubmed for long-range summarization | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb)| -|[Evaluate LED on Arxiv](https://github.com/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb) | How to effectively evaluate LED on long-range summarization | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb)| -|[Fine-tune LayoutLM on RVL-CDIP (a document image classification dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb) | How to fine-tune *LayoutLMForSequenceClassification* on the RVL-CDIP dataset for scanned document classification | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb)| -|[Wav2Vec2 CTC decoding with GPT2 adjustment](https://github.com/voidful/huggingface_notebook/blob/main/xlsr_gpt.ipynb) | How to decode CTC sequence with language model adjustment | [Eric Lam](https://github.com/voidful) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1e_z5jQHYbO2YKEaUgzb1ww1WwiAyydAj?usp=sharing)| -|[Fine-tune BART for summarization in two languages with Trainer class](https://github.com/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb) | How to fine-tune BART for summarization in two languages with Trainer class | [Eliza Szczechla](https://github.com/elsanns) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)| -|[Evaluate Big Bird on Trivia 
QA](https://github.com/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb) | How to evaluate BigBird on long document question answering on Trivia QA | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb)| -| [Create video captions using Wav2Vec2](https://github.com/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | How to create YouTube captions from any video by transcribing the audio with Wav2Vec | [Niklas Muennighoff](https://github.com/Muennighoff) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | -| [Fine-tune the Vision Transformer on CIFAR-10 using PyTorch Lightning](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | How to fine-tune the Vision Transformer (ViT) on CIFAR-10 using HuggingFace Transformers, Datasets and PyTorch Lightning | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | -| [Fine-tune the Vision Transformer on CIFAR-10 using the 🤗 Trainer](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | How to fine-tune the Vision Transformer (ViT) on CIFAR-10 using HuggingFace Transformers, Datasets and the 🤗 Trainer | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | -| [Evaluate LUKE on Open Entity, an entity typing dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | How to evaluate *LukeForEntityClassification* on the Open Entity dataset | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | -| [Evaluate LUKE on TACRED, a relation extraction dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | How to evaluate *LukeForEntityPairClassification* on the TACRED dataset | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | -| [Evaluate LUKE on CoNLL-2003, an important NER benchmark](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | How to evaluate *LukeForEntitySpanClassification* on the CoNLL-2003 dataset | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | -| [Evaluate BigBird-Pegasus on PubMed dataset](https://github.com/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | How to evaluate *BigBirdPegasusForConditionalGeneration* on PubMed dataset | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | -| [Speech Emotion Classification with Wav2Vec2](https://github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | How to leverage a pretrained Wav2Vec2 model for Emotion Classification on the MEGA dataset | [Mehrdad Farahani](https://github.com/m3hrdadfi) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | -| [Detect objects in an image with DETR](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | How to use a trained *DetrForObjectDetection* model to detect objects in an image and visualize attention | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | -| [Fine-tune DETR on a custom object detection dataset](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | How to fine-tune *DetrForObjectDetection* on a custom object detection dataset | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | -| [Finetune T5 for Named Entity Recognition](https://github.com/ToluClassics/Notebooks/blob/main/T5_Ner_Finetuning.ipynb) | How to fine-tune *T5* on a Named Entity Recognition Task | [Ogundepo Odunayo](https://github.com/ToluClassics) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing) | diff --git a/docs/source/en/converting_tensorflow_models.mdx b/docs/source/en/converting_tensorflow_models.mdx deleted file mode 100644 index 8dc51dd61670..000000000000 --- a/docs/source/en/converting_tensorflow_models.mdx +++ /dev/null @@ -1,162 +0,0 @@ - - -# Converting From Tensorflow Checkpoints - -A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints to models -that can be loaded using the `from_pretrained` methods of the library. - - - -Since 2.3.0 the conversion script is now part of the transformers CLI (**transformers-cli**) available in any -transformers >= 2.3.0 installation. - -The documentation below reflects the **transformers-cli convert** command format. 
- - - -## BERT - -You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the -[convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py) script. - -This CLI takes as input a TensorFlow checkpoint (three files starting with `bert_model.ckpt`) and the associated -configuration file (`bert_config.json`), and creates a PyTorch model for this configuration, loads the weights from -the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can -be imported using `from_pretrained()` (see example in [quicktour](quicktour) , [run_glue.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_glue.py) ). - -You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow -checkpoint (the three files starting with `bert_model.ckpt`) but be sure to keep the configuration file (\ -`bert_config.json`) and the vocabulary file (`vocab.txt`) as these are needed for the PyTorch model too. - -To run this specific conversion script you will need to have TensorFlow and PyTorch installed (`pip install tensorflow`). The rest of the repository only requires PyTorch. - -Here is an example of the conversion process for a pre-trained `BERT-Base Uncased` model: - -```bash -export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12 - -transformers-cli convert --model_type bert \ - --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \ - --config $BERT_BASE_DIR/bert_config.json \ - --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin -``` - -You can download Google's pre-trained models for the conversion [here](https://github.com/google-research/bert#pre-trained-models). - -## ALBERT - -Convert TensorFlow model checkpoints of ALBERT to PyTorch using the -[convert_albert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py) script. - -The CLI takes as input a TensorFlow checkpoint (three files starting with `model.ckpt-best`) and the accompanying -configuration file (`albert_config.json`), then creates and saves a PyTorch model. To run this conversion you will -need to have TensorFlow and PyTorch installed. - -Here is an example of the conversion process for the pre-trained `ALBERT Base` model: - -```bash -export ALBERT_BASE_DIR=/path/to/albert/albert_base - -transformers-cli convert --model_type albert \ - --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \ - --config $ALBERT_BASE_DIR/albert_config.json \ - --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin -``` - -You can download Google's pre-trained models for the conversion [here](https://github.com/google-research/albert#pre-trained-models). 
- -## OpenAI GPT - -Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint -save as the same format than OpenAI pretrained model (see [here](https://github.com/openai/finetune-transformer-lm)\ -) - -```bash -export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights - -transformers-cli convert --model_type gpt \ - --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--config OPENAI_GPT_CONFIG] \ - [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \ -``` - -## OpenAI GPT-2 - -Here is an example of the conversion process for a pre-trained OpenAI GPT-2 model (see [here](https://github.com/openai/gpt-2)) - -```bash -export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights - -transformers-cli convert --model_type gpt2 \ - --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--config OPENAI_GPT2_CONFIG] \ - [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK] -``` - -## Transformer-XL - -Here is an example of the conversion process for a pre-trained Transformer-XL model (see [here](https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models)) - -```bash -export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint - -transformers-cli convert --model_type transfo_xl \ - --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--config TRANSFO_XL_CONFIG] \ - [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK] -``` - -## XLNet - -Here is an example of the conversion process for a pre-trained XLNet model: - -```bash -export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint -export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config - -transformers-cli convert --model_type xlnet \ - --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \ - --config $TRANSFO_XL_CONFIG_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--finetuning_task_name XLNET_FINETUNED_TASK] \ -``` - -## XLM - -Here is an example of the conversion process for a pre-trained XLM model: - -```bash -export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint - -transformers-cli convert --model_type xlm \ - --tf_checkpoint $XLM_CHECKPOINT_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT - [--config XML_CONFIG] \ - [--finetuning_task_name XML_FINETUNED_TASK] -``` - -## T5 - -Here is an example of the conversion process for a pre-trained T5 model: - -```bash -export T5=/path/to/t5/uncased_L-12_H-768_A-12 - -transformers-cli convert --model_type t5 \ - --tf_checkpoint $T5/t5_model.ckpt \ - --config $T5/t5_config.json \ - --pytorch_dump_output $T5/pytorch_model.bin -``` diff --git a/docs/source/en/create_a_model.md b/docs/source/en/create_a_model.md new file mode 100644 index 000000000000..ba384d437b80 --- /dev/null +++ b/docs/source/en/create_a_model.md @@ -0,0 +1,389 @@ + + +# Create a custom architecture + +An [`AutoClass`](model_doc/auto) automatically infers the model architecture and downloads pretrained configuration and weights. Generally, we recommend using an `AutoClass` to produce checkpoint-agnostic code. But users who want more control over specific model parameters can create a custom 🤗 Transformers model from just a few base classes. This could be particularly useful for anyone who is interested in studying, training or experimenting with a 🤗 Transformers model. In this guide, dive deeper into creating a custom model without an `AutoClass`. 
Learn how to: + +- Load and customize a model configuration. +- Create a model architecture. +- Create a slow and fast tokenizer for text. +- Create an image processor for vision tasks. +- Create a feature extractor for audio tasks. +- Create a processor for multimodal tasks. + +## Configuration + +A [configuration](main_classes/configuration) refers to a model's specific attributes. Each model configuration has different attributes; for instance, all NLP models have the `hidden_size`, `num_attention_heads`, `num_hidden_layers` and `vocab_size` attributes in common. These attributes specify the number of attention heads or hidden layers to construct a model with. + +Get a closer look at [DistilBERT](model_doc/distilbert) by accessing [`DistilBertConfig`] to inspect its attributes: + +```py +>>> from transformers import DistilBertConfig + +>>> config = DistilBertConfig() +>>> print(config) +DistilBertConfig { + "activation": "gelu", + "attention_dropout": 0.1, + "dim": 768, + "dropout": 0.1, + "hidden_dim": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "model_type": "distilbert", + "n_heads": 12, + "n_layers": 6, + "pad_token_id": 0, + "qa_dropout": 0.1, + "seq_classif_dropout": 0.2, + "sinusoidal_pos_embds": false, + "transformers_version": "4.16.2", + "vocab_size": 30522 +} +``` + +[`DistilBertConfig`] displays all the default attributes used to build a base [`DistilBertModel`]. All attributes are customizable, creating space for experimentation. For example, you can customize a default model to: + +- Try a different activation function with the `activation` parameter. +- Use a higher dropout ratio for the attention probabilities with the `attention_dropout` parameter. + +```py +>>> my_config = DistilBertConfig(activation="relu", attention_dropout=0.4) +>>> print(my_config) +DistilBertConfig { + "activation": "relu", + "attention_dropout": 0.4, + "dim": 768, + "dropout": 0.1, + "hidden_dim": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "model_type": "distilbert", + "n_heads": 12, + "n_layers": 6, + "pad_token_id": 0, + "qa_dropout": 0.1, + "seq_classif_dropout": 0.2, + "sinusoidal_pos_embds": false, + "transformers_version": "4.16.2", + "vocab_size": 30522 +} +``` + +Pretrained model attributes can be modified in the [`~PretrainedConfig.from_pretrained`] function: + +```py +>>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4) +``` + +Once you are satisfied with your model configuration, you can save it with [`~PretrainedConfig.save_pretrained`]. Your configuration file is stored as a JSON file in the specified save directory: + +```py +>>> my_config.save_pretrained(save_directory="./your_model_save_path") +``` + +To reuse the configuration file, load it with [`~PretrainedConfig.from_pretrained`]: + +```py +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json") +``` + + + +You can also save your configuration file as a dictionary or even just the difference between your custom configuration attributes and the default configuration attributes! See the [configuration](main_classes/configuration) documentation for more details. + + + +## Model + +The next step is to create a [model](main_classes/models). The model - also loosely referred to as the architecture - defines what each layer is doing and what operations are happening. Attributes like `num_hidden_layers` from the configuration are used to define the architecture.
Every model shares the base class [`PreTrainedModel`] and a few common methods like resizing input embeddings and pruning self-attention heads. In addition, all models are also either a [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) or [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/flax.linen.html#module) subclass. This means models can be used like any other module in their respective framework. + + + +Load your custom configuration attributes into the model: + +```py +>>> from transformers import DistilBertModel + +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json") +>>> model = DistilBertModel(my_config) +``` + +This creates a model with random values instead of pretrained weights. You won't be able to use this model for anything useful until you train it. Training is a costly and time-consuming process. It is generally better to use a pretrained model to obtain better results faster, while using only a fraction of the resources required for training. + +Create a pretrained model with [`~PreTrainedModel.from_pretrained`]: + +```py +>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased") +``` + +When you load pretrained weights, the default model configuration is automatically loaded if the model is provided by 🤗 Transformers. However, you can still replace - some or all of - the default model configuration attributes with your own if you'd like: + +```py +>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) +``` + + +Load your custom configuration attributes into the model: + +```py +>>> from transformers import TFDistilBertModel + +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json") +>>> tf_model = TFDistilBertModel(my_config) +``` + +This creates a model with random values instead of pretrained weights. You won't be able to use this model for anything useful until you train it. Training is a costly and time-consuming process. It is generally better to use a pretrained model to obtain better results faster, while using only a fraction of the resources required for training. + +Create a pretrained model with [`~TFPreTrainedModel.from_pretrained`]: + +```py +>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased") +``` + +When you load pretrained weights, the default model configuration is automatically loaded if the model is provided by 🤗 Transformers. However, you can still replace - some or all of - the default model configuration attributes with your own if you'd like: + +```py +>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) +``` + + + +### Model heads + +At this point, you have a base DistilBERT model which outputs the *hidden states*. The hidden states are passed as inputs to a model head to produce the final output. 🤗 Transformers provides a different model head for each task as long as a model supports the task (i.e., you can't use DistilBERT for a sequence-to-sequence task like translation). + + + +For example, [`DistilBertForSequenceClassification`] is a base DistilBERT model with a sequence classification head. The sequence classification head is a linear layer on top of the pooled outputs.
+ +```py +>>> from transformers import DistilBertForSequenceClassification + +>>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Easily reuse this checkpoint for another task by switching to a different model head. For a question answering task, you would use the [`DistilBertForQuestionAnswering`] model head. The question answering head is similar to the sequence classification head except it is a linear layer on top of the hidden states output. + +```py +>>> from transformers import DistilBertForQuestionAnswering + +>>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") +``` + + +For example, [`TFDistilBertForSequenceClassification`] is a base DistilBERT model with a sequence classification head. The sequence classification head is a linear layer on top of the pooled outputs. + +```py +>>> from transformers import TFDistilBertForSequenceClassification + +>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Easily reuse this checkpoint for another task by switching to a different model head. For a question answering task, you would use the [`TFDistilBertForQuestionAnswering`] model head. The question answering head is similar to the sequence classification head except it is a linear layer on top of the hidden states output. + +```py +>>> from transformers import TFDistilBertForQuestionAnswering + +>>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") +``` + + + +## Tokenizer + +The last base class you need before using a model for textual data is a [tokenizer](main_classes/tokenizer) to convert raw text to tensors. There are two types of tokenizers you can use with 🤗 Transformers: + +- [`PreTrainedTokenizer`]: a Python implementation of a tokenizer. +- [`PreTrainedTokenizerFast`]: a tokenizer from our Rust-based [🤗 Tokenizer](https://huggingface.co/docs/tokenizers/python/latest/) library. This tokenizer type is significantly faster - especially during batch tokenization - due to its Rust implementation. The fast tokenizer also offers additional methods like *offset mapping* which maps tokens to their original words or characters. + +Both tokenizers support common methods such as encoding and decoding, adding new tokens, and managing special tokens. + + + +Not every model supports a fast tokenizer. Take a look at this [table](index#supported-frameworks) to check if a model has fast tokenizer support. + + + +If you trained your own tokenizer, you can create one from your *vocabulary* file: + +```py +>>> from transformers import DistilBertTokenizer + +>>> my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left") +``` + +It is important to remember the vocabulary from a custom tokenizer will be different from the vocabulary generated by a pretrained model's tokenizer. You need to use a pretrained model's vocabulary if you are using a pretrained model, otherwise the inputs won't make sense. 
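+Whichever tokenizer you end up with, the common methods mentioned earlier (encoding, decoding, and adding new tokens) work the same way. Here is a minimal sketch using the custom tokenizer from the snippet above; `my_vocab_file.txt` is a placeholder, so the exact token IDs depend on its contents:
+
+```py
+>>> encoding = my_tokenizer("Using a Transformer network is simple")
+>>> encoding.input_ids  # token IDs, only meaningful with this tokenizer's vocabulary
+>>> my_tokenizer.decode(encoding.input_ids)  # back to text, special tokens included
+>>> my_tokenizer.add_tokens(["[NEW_TOKEN]"])  # returns the number of tokens added to the vocabulary
+```
+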
Create a tokenizer with a pretrained model's vocabulary with the [`DistilBertTokenizer`] class: + +```py +>>> from transformers import DistilBertTokenizer + +>>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") +``` + +Create a fast tokenizer with the [`DistilBertTokenizerFast`] class: + +```py +>>> from transformers import DistilBertTokenizerFast + +>>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased") +``` + + + +By default, [`AutoTokenizer`] will try to load a fast tokenizer. You can disable this behavior by setting `use_fast=False` in `from_pretrained`. + + + +## Image Processor + +An image processor processes vision inputs. It inherits from the base [`~image_processing_utils.ImageProcessingMixin`] class. + +To use, create an image processor associated with the model you're using. For example, create a default [`ViTImageProcessor`] if you are using [ViT](model_doc/vit) for image classification: + +```py +>>> from transformers import ViTImageProcessor + +>>> vit_extractor = ViTImageProcessor() +>>> print(vit_extractor) +ViTImageProcessor { + "do_normalize": true, + "do_resize": true, + "image_processor_type": "ViTImageProcessor", + "image_mean": [ + 0.5, + 0.5, + 0.5 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": 2, + "size": 224 +} +``` + + + +If you aren't looking for any customization, just use the `from_pretrained` method to load a model's default image processor parameters. + + + +Modify any of the [`ViTImageProcessor`] parameters to create your custom image processor: + +```py +>>> from transformers import ViTImageProcessor + +>>> my_vit_extractor = ViTImageProcessor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3]) +>>> print(my_vit_extractor) +ViTImageProcessor { + "do_normalize": false, + "do_resize": true, + "image_processor_type": "ViTImageProcessor", + "image_mean": [ + 0.3, + 0.3, + 0.3 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": "PIL.Image.BOX", + "size": 224 +} +``` + +## Feature Extractor + +A feature extractor processes audio inputs. It inherits from the base [`~feature_extraction_utils.FeatureExtractionMixin`] class, and may also inherit from the [`SequenceFeatureExtractor`] class for processing audio inputs. + +To use, create a feature extractor associated with the model you're using. For example, create a default [`Wav2Vec2FeatureExtractor`] if you are using [Wav2Vec2](model_doc/wav2vec2) for audio classification: + +```py +>>> from transformers import Wav2Vec2FeatureExtractor + +>>> w2v2_extractor = Wav2Vec2FeatureExtractor() +>>> print(w2v2_extractor) +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 16000 +} +``` + + + +If you aren't looking for any customization, just use the `from_pretrained` method to load a model's default feature extractor parameters. 
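+For instance, a checkpoint's preconfigured feature extractor can be loaded in one line. This is only a minimal sketch; `facebook/wav2vec2-base-960h` is just one example checkpoint:
+
+```py
+>>> from transformers import Wav2Vec2FeatureExtractor
+
+>>> pretrained_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
+```
+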
+ + + +Modify any of the [`Wav2Vec2FeatureExtractor`] parameters to create your custom feature extractor: + +```py +>>> from transformers import Wav2Vec2FeatureExtractor + +>>> w2v2_extractor = Wav2Vec2FeatureExtractor(sampling_rate=8000, do_normalize=False) +>>> print(w2v2_extractor) +Wav2Vec2FeatureExtractor { + "do_normalize": false, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 8000 +} +``` + + +## Processor + +For models that support multimodal tasks, 🤗 Transformers offers a processor class that conveniently wraps processing classes such as a feature extractor and a tokenizer into a single object. For example, let's use the [`Wav2Vec2Processor`] for an automatic speech recognition task (ASR). ASR transcribes audio to text, so you will need a feature extractor and a tokenizer. + +Create a feature extractor to handle the audio inputs: + +```py +>>> from transformers import Wav2Vec2FeatureExtractor + +>>> feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True) +``` + +Create a tokenizer to handle the text inputs: + +```py +>>> from transformers import Wav2Vec2CTCTokenizer + +>>> tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt") +``` + +Combine the feature extractor and tokenizer in [`Wav2Vec2Processor`]: + +```py +>>> from transformers import Wav2Vec2Processor + +>>> processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) +``` + +With two basic classes - configuration and model - and an additional preprocessing class (tokenizer, image processor, feature extractor, or processor), you can create any of the models supported by 🤗 Transformers. Each of these base classes is configurable, allowing you to use the specific attributes you want. You can easily set up a model for training or modify an existing pretrained model to fine-tune. diff --git a/docs/source/en/create_a_model.mdx b/docs/source/en/create_a_model.mdx deleted file mode 100644 index b0bafa4589b2..000000000000 --- a/docs/source/en/create_a_model.mdx +++ /dev/null @@ -1,385 +0,0 @@ - - -# Create a custom architecture - -An [`AutoClass`](model_doc/auto) automatically infers the model architecture and downloads pretrained configuration and weights. Generally, we recommend using an `AutoClass` to produce checkpoint-agnostic code. But users who want more control over specific model parameters can create a custom 🤗 Transformers model from just a few base classes. This could be particularly useful for anyone who is interested in studying, training or experimenting with a 🤗 Transformers model. In this guide, dive deeper into creating a custom model without an `AutoClass`. Learn how to: - -- Load and customize a model configuration. -- Create a model architecture. -- Create a slow and fast tokenizer for text. -- Create an image processor for vision tasks. -- Create a feature extractor for audio tasks. -- Create a processor for multimodal tasks. - -## Configuration - -A [configuration](main_classes/configuration) refers to a model's specific attributes. Each model configuration has different attributes; for instance, all NLP models have the `hidden_size`, `num_attention_heads`, `num_hidden_layers` and `vocab_size` attributes in common. These attributes specify the number of attention heads or hidden layers to construct a model with.
- -Get a closer look at [DistilBERT](model_doc/distilbert) by accessing [`DistilBertConfig`] to inspect it's attributes: - -```py ->>> from transformers import DistilBertConfig - ->>> config = DistilBertConfig() ->>> print(config) -DistilBertConfig { - "activation": "gelu", - "attention_dropout": 0.1, - "dim": 768, - "dropout": 0.1, - "hidden_dim": 3072, - "initializer_range": 0.02, - "max_position_embeddings": 512, - "model_type": "distilbert", - "n_heads": 12, - "n_layers": 6, - "pad_token_id": 0, - "qa_dropout": 0.1, - "seq_classif_dropout": 0.2, - "sinusoidal_pos_embds": false, - "transformers_version": "4.16.2", - "vocab_size": 30522 -} -``` - -[`DistilBertConfig`] displays all the default attributes used to build a base [`DistilBertModel`]. All attributes are customizable, creating space for experimentation. For example, you can customize a default model to: - -- Try a different activation function with the `activation` parameter. -- Use a higher dropout ratio for the attention probabilities with the `attention_dropout` parameter. - -```py ->>> my_config = DistilBertConfig(activation="relu", attention_dropout=0.4) ->>> print(my_config) -DistilBertConfig { - "activation": "relu", - "attention_dropout": 0.4, - "dim": 768, - "dropout": 0.1, - "hidden_dim": 3072, - "initializer_range": 0.02, - "max_position_embeddings": 512, - "model_type": "distilbert", - "n_heads": 12, - "n_layers": 6, - "pad_token_id": 0, - "qa_dropout": 0.1, - "seq_classif_dropout": 0.2, - "sinusoidal_pos_embds": false, - "transformers_version": "4.16.2", - "vocab_size": 30522 -} -``` - -Pretrained model attributes can be modified in the [`~PretrainedConfig.from_pretrained`] function: - -```py ->>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4) -``` - -Once you are satisfied with your model configuration, you can save it with [`~PretrainedConfig.save_pretrained`]. Your configuration file is stored as a JSON file in the specified save directory: - -```py ->>> my_config.save_pretrained(save_directory="./your_model_save_path") -``` - -To reuse the configuration file, load it with [`~PretrainedConfig.from_pretrained`]: - -```py ->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") -``` - - - -You can also save your configuration file as a dictionary or even just the difference between your custom configuration attributes and the default configuration attributes! See the [configuration](main_classes/configuration) documentation for more details. - - - -## Model - -The next step is to create a [model](main_classes/models). The model - also loosely referred to as the architecture - defines what each layer is doing and what operations are happening. Attributes like `num_hidden_layers` from the configuration are used to define the architecture. Every model shares the base class [`PreTrainedModel`] and a few common methods like resizing input embeddings and pruning self-attention heads. In addition, all models are also either a [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) or [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/flax.linen.html#module) subclass. This means models are compatible with each of their respective framework's usage. 
- - - -Load your custom configuration attributes into the model: - -```py ->>> from transformers import DistilBertModel - ->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") ->>> model = DistilBertModel(my_config) -``` - -This creates a model with random values instead of pretrained weights. You won't be able to use this model for anything useful yet until you train it. Training is a costly and time-consuming process. It is generally better to use a pretrained model to obtain better results faster, while using only a fraction of the resources required for training. - -Create a pretrained model with [`~PreTrainedModel.from_pretrained`]: - -```py ->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased") -``` - -When you load pretrained weights, the default model configuration is automatically loaded if the model is provided by 🤗 Transformers. However, you can still replace - some or all of - the default model configuration attributes with your own if you'd like: - -```py ->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) -``` - - -Load your custom configuration attributes into the model: - -```py ->>> from transformers import TFDistilBertModel - ->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") ->>> tf_model = TFDistilBertModel(my_config) -``` - -This creates a model with random values instead of pretrained weights. You won't be able to use this model for anything useful yet until you train it. Training is a costly and time-consuming process. It is generally better to use a pretrained model to obtain better results faster, while using only a fraction of the resources required for training. - -Create a pretrained model with [`~TFPreTrainedModel.from_pretrained`]: - -```py ->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased") -``` - -When you load pretrained weights, the default model configuration is automatically loaded if the model is provided by 🤗 Transformers. However, you can still replace - some or all of - the default model configuration attributes with your own if you'd like: - -```py ->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) -``` - - - -### Model heads - -At this point, you have a base DistilBERT model which outputs the *hidden states*. The hidden states are passed as inputs to a model head to produce the final output. 🤗 Transformers provides a different model head for each task as long as a model supports the task (i.e., you can't use DistilBERT for a sequence-to-sequence task like translation). - - - -For example, [`DistilBertForSequenceClassification`] is a base DistilBERT model with a sequence classification head. The sequence classification head is a linear layer on top of the pooled outputs. - -```py ->>> from transformers import DistilBertForSequenceClassification - ->>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Easily reuse this checkpoint for another task by switching to a different model head. For a question answering task, you would use the [`DistilBertForQuestionAnswering`] model head. The question answering head is similar to the sequence classification head except it is a linear layer on top of the hidden states output. 
- -```py ->>> from transformers import DistilBertForQuestionAnswering - ->>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") -``` - - -For example, [`TFDistilBertForSequenceClassification`] is a base DistilBERT model with a sequence classification head. The sequence classification head is a linear layer on top of the pooled outputs. - -```py ->>> from transformers import TFDistilBertForSequenceClassification - ->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Easily reuse this checkpoint for another task by switching to a different model head. For a question answering task, you would use the [`TFDistilBertForQuestionAnswering`] model head. The question answering head is similar to the sequence classification head except it is a linear layer on top of the hidden states output. - -```py ->>> from transformers import TFDistilBertForQuestionAnswering - ->>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") -``` - - - -## Tokenizer - -The last base class you need before using a model for textual data is a [tokenizer](main_classes/tokenizer) to convert raw text to tensors. There are two types of tokenizers you can use with 🤗 Transformers: - -- [`PreTrainedTokenizer`]: a Python implementation of a tokenizer. -- [`PreTrainedTokenizerFast`]: a tokenizer from our Rust-based [🤗 Tokenizer](https://huggingface.co/docs/tokenizers/python/latest/) library. This tokenizer type is significantly faster - especially during batch tokenization - due to it's Rust implementation. The fast tokenizer also offers additional methods like *offset mapping* which maps tokens to their original words or characters. - -Both tokenizers support common methods such as encoding and decoding, adding new tokens, and managing special tokens. - - - -Not every model supports a fast tokenizer. Take a look at this [table](index#supported-frameworks) to check if a model has fast tokenizer support. - - - -If you trained your own tokenizer, you can create one from your *vocabulary* file: - -```py ->>> from transformers import DistilBertTokenizer - ->>> my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left") -``` - -It is important to remember the vocabulary from a custom tokenizer will be different from the vocabulary generated by a pretrained model's tokenizer. You need to use a pretrained model's vocabulary if you are using a pretrained model, otherwise the inputs won't make sense. Create a tokenizer with a pretrained model's vocabulary with the [`DistilBertTokenizer`] class: - -```py ->>> from transformers import DistilBertTokenizer - ->>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") -``` - -Create a fast tokenizer with the [`DistilBertTokenizerFast`] class: - -```py ->>> from transformers import DistilBertTokenizerFast - ->>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased") -``` - - - -By default, [`AutoTokenizer`] will try to load a fast tokenizer. You can disable this behavior by setting `use_fast=False` in `from_pretrained`. - - - -## Image Processor - -An image processor processes vision inputs. It inherits from the base [`~image_processing_utils.ImageProcessingMixin`] class. - -To use, create an image processor associated with the model you're using. 
For example, create a default [`ViTImageProcessor`] if you are using [ViT](model_doc/vit) for image classification: - -```py ->>> from transformers import ViTImageProcessor - ->>> vit_extractor = ViTImageProcessor() ->>> print(vit_extractor) -ViTImageProcessor { - "do_normalize": true, - "do_resize": true, - "feature_extractor_type": "ViTImageProcessor", - "image_mean": [ - 0.5, - 0.5, - 0.5 - ], - "image_std": [ - 0.5, - 0.5, - 0.5 - ], - "resample": 2, - "size": 224 -} -``` - - - -If you aren't looking for any customization, just use the `from_pretrained` method to load a model's default image processor parameters. - - - -Modify any of the [`ViTImageProcessor`] parameters to create your custom image processor: - -```py ->>> from transformers import ViTImageProcessor - ->>> my_vit_extractor = ViTImageProcessor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3]) ->>> print(my_vit_extractor) -ViTImageProcessor { - "do_normalize": false, - "do_resize": true, - "feature_extractor_type": "ViTImageProcessor", - "image_mean": [ - 0.3, - 0.3, - 0.3 - ], - "image_std": [ - 0.5, - 0.5, - 0.5 - ], - "resample": "PIL.Image.BOX", - "size": 224 -} -``` - -## Feature Extractor - -A feature extractor processes audio inputs. It inherits from the base [`~feature_extraction_utils.FeatureExtractionMixin`] class, and may also inherit from the [`SequenceFeatureExtractor`] class for processing audio inputs. - -To use, create a feature extractor associated with the model you're using. For example, create a default [`Wav2Vec2FeatureExtractor`] if you are using [Wav2Vec2](model_doc/wav2vec2) for audio classification: - -```py ->>> from transformers import Wav2Vec2FeatureExtractor - ->>> w2v2_extractor = Wav2Vec2FeatureExtractor() ->>> print(w2v2_extractor) -Wav2Vec2FeatureExtractor { - "do_normalize": true, - "feature_extractor_type": "Wav2Vec2FeatureExtractor", - "feature_size": 1, - "padding_side": "right", - "padding_value": 0.0, - "return_attention_mask": false, - "sampling_rate": 16000 -} -``` - - - -If you aren't looking for any customization, just use the `from_pretrained` method to load a model's default feature extractor parameters. - - - -Modify any of the [`Wav2Vec2FeatureExtractor`] parameters to create your custom feature extractor: - -```py ->>> from transformers import Wav2Vec2FeatureExtractor - ->>> w2v2_extractor = Wav2Vec2FeatureExtractor(sampling_rate=8000, do_normalize=False) ->>> print(w2v2_extractor) -Wav2Vec2FeatureExtractor { - "do_normalize": false, - "feature_extractor_type": "Wav2Vec2FeatureExtractor", - "feature_size": 1, - "padding_side": "right", - "padding_value": 0.0, - "return_attention_mask": false, - "sampling_rate": 8000 -} -``` - - -## Processor - -For models that support multimodal tasks, 🤗 Transformers offers a processor class that conveniently wraps processing classes such as a feature extractor and a tokenizer into a single object. For example, let's use the [`Wav2Vec2Processor`] for an automatic speech recognition task (ASR). ASR transcribes audio to text, so you will need a feature extractor and a tokenizer. 
- -Create a feature extractor to handle the audio inputs: - -```py ->>> from transformers import Wav2Vec2FeatureExtractor - ->>> feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True) -``` - -Create a tokenizer to handle the text inputs: - -```py ->>> from transformers import Wav2Vec2CTCTokenizer - ->>> tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt") -``` - -Combine the feature extractor and tokenizer in [`Wav2Vec2Processor`]: - -```py ->>> from transformers import Wav2Vec2Processor - ->>> processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) -``` - -With two basic classes - configuration and model - and an additional preprocessing class (tokenizer, image processor, feature extractor, or processor), you can create any of the models supported by 🤗 Transformers. Each of these base classes are configurable, allowing you to use the specific attributes you want. You can easily setup a model for training or modify an existing pretrained model to fine-tune. diff --git a/docs/source/en/custom_models.md b/docs/source/en/custom_models.md new file mode 100644 index 000000000000..d709772eed06 --- /dev/null +++ b/docs/source/en/custom_models.md @@ -0,0 +1,356 @@ + + +# Sharing custom models + +The 🤗 Transformers library is designed to be easily extensible. Every model is fully coded in a given subfolder +of the repository with no abstraction, so you can easily copy a modeling file and tweak it to your needs. + +If you are writing a brand new model, it might be easier to start from scratch. In this tutorial, we will show you +how to write a custom model and its configuration so it can be used inside Transformers, and how you can share it +with the community (with the code it relies on) so that anyone can use it, even if it's not present in the 🤗 +Transformers library. + +We will illustrate all of this on a ResNet model, by wrapping the ResNet class of the +[timm library](https://github.com/rwightman/pytorch-image-models) into a [`PreTrainedModel`]. + +## Writing a custom configuration + +Before we dive into the model, let's first write its configuration. The configuration of a model is an object that +will contain all the necessary information to build the model. As we will see in the next section, the model can only +take a `config` to be initialized, so we really need that object to be as complete as possible. + +In our example, we will take a couple of arguments of the ResNet class that we might want to tweak. Different +configurations will then give us the different types of ResNets that are possible. We then just store those arguments, +after checking the validity of a few of them. 
+ +```python +from transformers import PretrainedConfig +from typing import List + + +class ResnetConfig(PretrainedConfig): + model_type = "resnet" + + def __init__( + self, + block_type="bottleneck", + layers: List[int] = [3, 4, 6, 3], + num_classes: int = 1000, + input_channels: int = 3, + cardinality: int = 1, + base_width: int = 64, + stem_width: int = 64, + stem_type: str = "", + avg_down: bool = False, + **kwargs, + ): + if block_type not in ["basic", "bottleneck"]: + raise ValueError(f"`block_type` must be 'basic' or 'bottleneck', got {block_type}.") + if stem_type not in ["", "deep", "deep-tiered"]: + raise ValueError(f"`stem_type` must be '', 'deep' or 'deep-tiered', got {stem_type}.") + + self.block_type = block_type + self.layers = layers + self.num_classes = num_classes + self.input_channels = input_channels + self.cardinality = cardinality + self.base_width = base_width + self.stem_width = stem_width + self.stem_type = stem_type + self.avg_down = avg_down + super().__init__(**kwargs) +``` + +The three important things to remember when writing your own configuration are the following: +- you have to inherit from `PretrainedConfig`, +- the `__init__` of your `PretrainedConfig` must accept any kwargs, +- those `kwargs` need to be passed to the superclass `__init__`. + +The inheritance is to make sure you get all the functionality from the 🤗 Transformers library, while the two other +constraints come from the fact that a `PretrainedConfig` has more fields than the ones you are setting. When reloading a +config with the `from_pretrained` method, those fields need to be accepted by your config and then sent to the +superclass. + +Defining a `model_type` for your configuration (here `model_type="resnet"`) is not mandatory, unless you want to +register your model with the auto classes (see last section). + +With this done, you can easily create and save your configuration like you would do with any other model config of the +library. Here is how we can create a resnet50d config and save it: + +```py +resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) +resnet50d_config.save_pretrained("custom-resnet") +``` + +This will save a file named `config.json` inside the folder `custom-resnet`. You can then reload your config with the +`from_pretrained` method: + +```py +resnet50d_config = ResnetConfig.from_pretrained("custom-resnet") +``` + +You can also use any other method of the [`PretrainedConfig`] class, like [`~PretrainedConfig.push_to_hub`] to +directly upload your config to the Hub. + +## Writing a custom model + +Now that we have our ResNet configuration, we can go on writing the model. We will actually write two: one that +extracts the hidden features from a batch of images (like [`BertModel`]) and one that is suitable for image +classification (like [`BertForSequenceClassification`]). + +As we mentioned before, we'll only write a loose wrapper of the model to keep it simple for this example. The only +thing we need to do before writing this class is a map between the block types and actual block classes. 
Then the +model is defined from the configuration by passing everything to the `ResNet` class: + +```py +from transformers import PreTrainedModel +from timm.models.resnet import BasicBlock, Bottleneck, ResNet +from .configuration_resnet import ResnetConfig + + +BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck} + + +class ResnetModel(PreTrainedModel): + config_class = ResnetConfig + + def __init__(self, config): + super().__init__(config) + block_layer = BLOCK_MAPPING[config.block_type] + self.model = ResNet( + block_layer, + config.layers, + num_classes=config.num_classes, + in_chans=config.input_channels, + cardinality=config.cardinality, + base_width=config.base_width, + stem_width=config.stem_width, + stem_type=config.stem_type, + avg_down=config.avg_down, + ) + + def forward(self, tensor): + return self.model.forward_features(tensor) +``` + +For the model that will classify images, we just change the forward method: + +```py +import torch + + +class ResnetModelForImageClassification(PreTrainedModel): + config_class = ResnetConfig + + def __init__(self, config): + super().__init__(config) + block_layer = BLOCK_MAPPING[config.block_type] + self.model = ResNet( + block_layer, + config.layers, + num_classes=config.num_classes, + in_chans=config.input_channels, + cardinality=config.cardinality, + base_width=config.base_width, + stem_width=config.stem_width, + stem_type=config.stem_type, + avg_down=config.avg_down, + ) + + def forward(self, tensor, labels=None): + logits = self.model(tensor) + if labels is not None: + loss = torch.nn.functional.cross_entropy(logits, labels) + return {"loss": loss, "logits": logits} + return {"logits": logits} +``` + +In both cases, notice how we inherit from `PreTrainedModel` and call the superclass initialization with the `config` +(a bit like when you write a regular `torch.nn.Module`). The line that sets the `config_class` is not mandatory, unless +you want to register your model with the auto classes (see last section). + + + +If your model is very similar to a model inside the library, you can re-use the same configuration as this model. + + + +You can have your model return anything you want, but returning a dictionary like we did for +`ResnetModelForImageClassification`, with the loss included when labels are passed, will make your model directly +usable inside the [`Trainer`] class. Using another output format is fine as long as you are planning on using your own +training loop or another library for training. + +Now that we have our model class, let's create one: + +```py +resnet50d = ResnetModelForImageClassification(resnet50d_config) +``` + +Again, you can use any of the methods of [`PreTrainedModel`], like [`~PreTrainedModel.save_pretrained`] or +[`~PreTrainedModel.push_to_hub`]. We will use the second in the next section, and see how to push the model weights +with the code of our model. But first, let's load some pretrained weights inside our model. + +In your own use case, you will probably be training your custom model on your own data. To keep this tutorial quick, +we will use the pretrained version of the resnet50d. Since our model is just a wrapper around it, it's going to be +easy to transfer those weights: + +```py +import timm + +pretrained_model = timm.create_model("resnet50d", pretrained=True) +resnet50d.model.load_state_dict(pretrained_model.state_dict()) +``` + +Now let's see how to make sure that when we do [`~PreTrainedModel.save_pretrained`] or [`~PreTrainedModel.push_to_hub`], the +code of the model is saved. 
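Before moving on, here is an optional sanity check (not part of the original guide) that the weight transfer above worked: run a dummy batch through the wrapped model. The shapes below assume the usual 3x224x224 image resolution and the default 1000-class configuration created earlier.

```py
import torch

# Two random 224x224 RGB images, just to exercise the forward pass.
dummy_batch = torch.randn(2, 3, 224, 224)
with torch.no_grad():
    outputs = resnet50d(dummy_batch)
print(outputs["logits"].shape)  # expected: torch.Size([2, 1000])
```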
+ +## Sending the code to the Hub + + + +This API is experimental and may have some slight breaking changes in the next releases. + + + +First, make sure your model is fully defined in a `.py` file. It can rely on relative imports to some other files as +long as all the files are in the same directory (we don't support submodules for this feature yet). For our example, +we'll define a `modeling_resnet.py` file and a `configuration_resnet.py` file in a folder of the current working +directory named `resnet_model`. The configuration file contains the code for `ResnetConfig` and the modeling file +contains the code of `ResnetModel` and `ResnetModelForImageClassification`. + +``` +. +└── resnet_model + ├── __init__.py + ├── configuration_resnet.py + └── modeling_resnet.py +``` + +The `__init__.py` can be empty, it's just there so that Python detects `resnet_model` can be use as a module. + + + +If copying a modeling files from the library, you will need to replace all the relative imports at the top of the file +to import from the `transformers` package. + + + +Note that you can re-use (or subclass) an existing configuration/model. + +To share your model with the community, follow those steps: first import the ResNet model and config from the newly +created files: + +```py +from resnet_model.configuration_resnet import ResnetConfig +from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification +``` + +Then you have to tell the library you want to copy the code files of those objects when using the `save_pretrained` +method and properly register them with a given Auto class (especially for models), just run: + +```py +ResnetConfig.register_for_auto_class() +ResnetModel.register_for_auto_class("AutoModel") +ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification") +``` + +Note that there is no need to specify an auto class for the configuration (there is only one auto class for them, +[`AutoConfig`]) but it's different for models. Your custom model could be suitable for many different tasks, so you +have to specify which one of the auto classes is the correct one for your model. + +Next, let's create the config and models as we did before: + +```py +resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) +resnet50d = ResnetModelForImageClassification(resnet50d_config) + +pretrained_model = timm.create_model("resnet50d", pretrained=True) +resnet50d.model.load_state_dict(pretrained_model.state_dict()) +``` + +Now to send the model to the Hub, make sure you are logged in. Either run in your terminal: + +```bash +huggingface-cli login +``` + +or from a notebook: + +```py +from huggingface_hub import notebook_login + +notebook_login() +``` + +You can then push to your own namespace (or an organization you are a member of) like this: + +```py +resnet50d.push_to_hub("custom-resnet50d") +``` + +On top of the modeling weights and the configuration in json format, this also copied the modeling and +configuration `.py` files in the folder `custom-resnet50d` and uploaded the result to the Hub. You can check the result +in this [model repo](https://huggingface.co/sgugger/custom-resnet50d). + +See the [sharing tutorial](model_sharing) for more information on the push to Hub method. + +## Using a model with custom code + +You can use any configuration, model or tokenizer with custom code files in its repository with the auto-classes and +the `from_pretrained` method. 
All files and code uploaded to the Hub are scanned for malware (refer to the [Hub security](https://huggingface.co/docs/hub/security#malware-scanning) documentation for more information), but you should still +review the model code and author to avoid executing malicious code on your machine. Set `trust_remote_code=True` to use +a model with custom code: + +```py +from transformers import AutoModelForImageClassification + +model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True) +``` + +It is also strongly encouraged to pass a commit hash as a `revision` to make sure the author of the models did not +update the code with some malicious new lines (unless you fully trust the authors of the models). + +```py +commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292" +model = AutoModelForImageClassification.from_pretrained( + "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash +) +``` + +Note that when browsing the commit history of the model repo on the Hub, there is a button to easily copy the commit +hash of any commit. + +## Registering a model with custom code to the auto classes + +If you are writing a library that extends 🤗 Transformers, you may want to extend the auto classes to include your own +model. This is different from pushing the code to the Hub in the sense that users will need to import your library to +get the custom models (contrarily to automatically downloading the model code from the Hub). + +As long as your config has a `model_type` attribute that is different from existing model types, and that your model +classes have the right `config_class` attributes, you can just add them to the auto classes like this: + +```py +from transformers import AutoConfig, AutoModel, AutoModelForImageClassification + +AutoConfig.register("resnet", ResnetConfig) +AutoModel.register(ResnetConfig, ResnetModel) +AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification) +``` + +Note that the first argument used when registering your custom config to [`AutoConfig`] needs to match the `model_type` +of your custom config, and the first argument used when registering your custom models to any auto model class needs +to match the `config_class` of those models. diff --git a/docs/source/en/custom_models.mdx b/docs/source/en/custom_models.mdx deleted file mode 100644 index f5ad55856243..000000000000 --- a/docs/source/en/custom_models.mdx +++ /dev/null @@ -1,352 +0,0 @@ - - -# Sharing custom models - -The 🤗 Transformers library is designed to be easily extensible. Every model is fully coded in a given subfolder -of the repository with no abstraction, so you can easily copy a modeling file and tweak it to your needs. - -If you are writing a brand new model, it might be easier to start from scratch. In this tutorial, we will show you -how to write a custom model and its configuration so it can be used inside Transformers, and how you can share it -with the community (with the code it relies on) so that anyone can use it, even if it's not present in the 🤗 -Transformers library. - -We will illustrate all of this on a ResNet model, by wrapping the ResNet class of the -[timm library](https://github.com/rwightman/pytorch-image-models) into a [`PreTrainedModel`]. - -## Writing a custom configuration - -Before we dive into the model, let's first write its configuration. The configuration of a model is an object that -will contain all the necessary information to build the model. 
As we will see in the next section, the model can only -take a `config` to be initialized, so we really need that object to be as complete as possible. - -In our example, we will take a couple of arguments of the ResNet class that we might want to tweak. Different -configurations will then give us the different types of ResNets that are possible. We then just store those arguments, -after checking the validity of a few of them. - -```python -from transformers import PretrainedConfig -from typing import List - - -class ResnetConfig(PretrainedConfig): - model_type = "resnet" - - def __init__( - self, - block_type="bottleneck", - layers: List[int] = [3, 4, 6, 3], - num_classes: int = 1000, - input_channels: int = 3, - cardinality: int = 1, - base_width: int = 64, - stem_width: int = 64, - stem_type: str = "", - avg_down: bool = False, - **kwargs, - ): - if block_type not in ["basic", "bottleneck"]: - raise ValueError(f"`block_type` must be 'basic' or bottleneck', got {block_type}.") - if stem_type not in ["", "deep", "deep-tiered"]: - raise ValueError(f"`stem_type` must be '', 'deep' or 'deep-tiered', got {stem_type}.") - - self.block_type = block_type - self.layers = layers - self.num_classes = num_classes - self.input_channels = input_channels - self.cardinality = cardinality - self.base_width = base_width - self.stem_width = stem_width - self.stem_type = stem_type - self.avg_down = avg_down - super().__init__(**kwargs) -``` - -The three important things to remember when writing you own configuration are the following: -- you have to inherit from `PretrainedConfig`, -- the `__init__` of your `PretrainedConfig` must accept any kwargs, -- those `kwargs` need to be passed to the superclass `__init__`. - -The inheritance is to make sure you get all the functionality from the 🤗 Transformers library, while the two other -constraints come from the fact a `PretrainedConfig` has more fields than the ones you are setting. When reloading a -config with the `from_pretrained` method, those fields need to be accepted by your config and then sent to the -superclass. - -Defining a `model_type` for your configuration (here `model_type="resnet"`) is not mandatory, unless you want to -register your model with the auto classes (see last section). - -With this done, you can easily create and save your configuration like you would do with any other model config of the -library. Here is how we can create a resnet50d config and save it: - -```py -resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) -resnet50d_config.save_pretrained("custom-resnet") -``` - -This will save a file named `config.json` inside the folder `custom-resnet`. You can then reload your config with the -`from_pretrained` method: - -```py -resnet50d_config = ResnetConfig.from_pretrained("custom-resnet") -``` - -You can also use any other method of the [`PretrainedConfig`] class, like [`~PretrainedConfig.push_to_hub`] to -directly upload your config to the Hub. - -## Writing a custom model - -Now that we have our ResNet configuration, we can go on writing the model. We will actually write two: one that -extracts the hidden features from a batch of images (like [`BertModel`]) and one that is suitable for image -classification (like [`BertForSequenceClassification`]). - -As we mentioned before, we'll only write a loose wrapper of the model to keep it simple for this example. The only -thing we need to do before writing this class is a map between the block types and actual block classes. 
Then the -model is defined from the configuration by passing everything to the `ResNet` class: - -```py -from transformers import PreTrainedModel -from timm.models.resnet import BasicBlock, Bottleneck, ResNet -from .configuration_resnet import ResnetConfig - - -BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck} - - -class ResnetModel(PreTrainedModel): - config_class = ResnetConfig - - def __init__(self, config): - super().__init__(config) - block_layer = BLOCK_MAPPING[config.block_type] - self.model = ResNet( - block_layer, - config.layers, - num_classes=config.num_classes, - in_chans=config.input_channels, - cardinality=config.cardinality, - base_width=config.base_width, - stem_width=config.stem_width, - stem_type=config.stem_type, - avg_down=config.avg_down, - ) - - def forward(self, tensor): - return self.model.forward_features(tensor) -``` - -For the model that will classify images, we just change the forward method: - -```py -import torch - - -class ResnetModelForImageClassification(PreTrainedModel): - config_class = ResnetConfig - - def __init__(self, config): - super().__init__(config) - block_layer = BLOCK_MAPPING[config.block_type] - self.model = ResNet( - block_layer, - config.layers, - num_classes=config.num_classes, - in_chans=config.input_channels, - cardinality=config.cardinality, - base_width=config.base_width, - stem_width=config.stem_width, - stem_type=config.stem_type, - avg_down=config.avg_down, - ) - - def forward(self, tensor, labels=None): - logits = self.model(tensor) - if labels is not None: - loss = torch.nn.cross_entropy(logits, labels) - return {"loss": loss, "logits": logits} - return {"logits": logits} -``` - -In both cases, notice how we inherit from `PreTrainedModel` and call the superclass initialization with the `config` -(a bit like when you write a regular `torch.nn.Module`). The line that sets the `config_class` is not mandatory, unless -you want to register your model with the auto classes (see last section). - - - -If your model is very similar to a model inside the library, you can re-use the same configuration as this model. - - - -You can have your model return anything you want, but returning a dictionary like we did for -`ResnetModelForImageClassification`, with the loss included when labels are passed, will make your model directly -usable inside the [`Trainer`] class. Using another output format is fine as long as you are planning on using your own -training loop or another library for training. - -Now that we have our model class, let's create one: - -```py -resnet50d = ResnetModelForImageClassification(resnet50d_config) -``` - -Again, you can use any of the methods of [`PreTrainedModel`], like [`~PreTrainedModel.save_pretrained`] or -[`~PreTrainedModel.push_to_hub`]. We will use the second in the next section, and see how to push the model weights -with the code of our model. But first, let's load some pretrained weights inside our model. - -In your own use case, you will probably be training your custom model on your own data. To go fast for this tutorial, -we will use the pretrained version of the resnet50d. Since our model is just a wrapper around it, it's going to be -easy to transfer those weights: - -```py -import timm - -pretrained_model = timm.create_model("resnet50d", pretrained=True) -resnet50d.model.load_state_dict(pretrained_model.state_dict()) -``` - -Now let's see how to make sure that when we do [`~PreTrainedModel.save_pretrained`] or [`~PreTrainedModel.push_to_hub`], the -code of the model is saved. 
- -## Sending the code to the Hub - - - -This API is experimental and may have some slight breaking changes in the next releases. - - - -First, make sure your model is fully defined in a `.py` file. It can rely on relative imports to some other files as -long as all the files are in the same directory (we don't support submodules for this feature yet). For our example, -we'll define a `modeling_resnet.py` file and a `configuration_resnet.py` file in a folder of the current working -directory named `resnet_model`. The configuration file contains the code for `ResnetConfig` and the modeling file -contains the code of `ResnetModel` and `ResnetModelForImageClassification`. - -``` -. -└── resnet_model - ├── __init__.py - ├── configuration_resnet.py - └── modeling_resnet.py -``` - -The `__init__.py` can be empty, it's just there so that Python detects `resnet_model` can be use as a module. - - - -If copying a modeling files from the library, you will need to replace all the relative imports at the top of the file -to import from the `transformers` package. - - - -Note that you can re-use (or subclass) an existing configuration/model. - -To share your model with the community, follow those steps: first import the ResNet model and config from the newly -created files: - -```py -from resnet_model.configuration_resnet import ResnetConfig -from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification -``` - -Then you have to tell the library you want to copy the code files of those objects when using the `save_pretrained` -method and properly register them with a given Auto class (especially for models), just run: - -```py -ResnetConfig.register_for_auto_class() -ResnetModel.register_for_auto_class("AutoModel") -ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification") -``` - -Note that there is no need to specify an auto class for the configuration (there is only one auto class for them, -[`AutoConfig`]) but it's different for models. Your custom model could be suitable for many different tasks, so you -have to specify which one of the auto classes is the correct one for your model. - -Next, let's create the config and models as we did before: - -```py -resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) -resnet50d = ResnetModelForImageClassification(resnet50d_config) - -pretrained_model = timm.create_model("resnet50d", pretrained=True) -resnet50d.model.load_state_dict(pretrained_model.state_dict()) -``` - -Now to send the model to the Hub, make sure you are logged in. Either run in your terminal: - -```bash -huggingface-cli login -``` - -or from a notebook: - -```py -from huggingface_hub import notebook_login - -notebook_login() -``` - -You can then push to your own namespace (or an organization you are a member of) like this: - -```py -resnet50d.push_to_hub("custom-resnet50d") -``` - -On top of the modeling weights and the configuration in json format, this also copied the modeling and -configuration `.py` files in the folder `custom-resnet50d` and uploaded the result to the Hub. You can check the result -in this [model repo](https://huggingface.co/sgugger/custom-resnet50d). - -See the [sharing tutorial](model_sharing) for more information on the push to Hub method. - -## Using a model with custom code - -You can use any configuration, model or tokenizer with custom code files in its repository with the auto-classes and -the `from_pretrained` method. 
All files and code uploaded to the Hub are scanned for malware (refer to the [Hub security](https://huggingface.co/docs/hub/security#malware-scanning) documentation for more information), but you should still -review the model code and author to avoid executing malicious code on your machine. Set `trust_remote_code=True` to use -a model with custom code: - -```py -from transformers import AutoModelForImageClassification - -model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True) -``` - -It is also strongly encouraged to pass a commit hash as a `revision` to make sure the author of the models did not -update the code with some malicious new lines (unless you fully trust the authors of the models). - -```py -commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292" -model = AutoModelForImageClassification.from_pretrained( - "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash -) -``` - -Note that when browsing the commit history of the model repo on the Hub, there is a button to easily copy the commit -hash of any commit. - -## Registering a model with custom code to the auto classes - -If you are writing a library that extends 🤗 Transformers, you may want to extend the auto classes to include your own -model. This is different from pushing the code to the Hub in the sense that users will need to import your library to -get the custom models (contrarily to automatically downloading the model code from the Hub). - -As long as your config has a `model_type` attribute that is different from existing model types, and that your model -classes have the right `config_class` attributes, you can just add them to the auto classes likes this: - -```py -from transformers import AutoConfig, AutoModel, AutoModelForImageClassification - -AutoConfig.register("resnet", ResnetConfig) -AutoModel.register(ResnetConfig, ResnetModel) -AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification) -``` - -Note that the first argument used when registering your custom config to [`AutoConfig`] needs to match the `model_type` -of your custom config, and the first argument used when registering your custom models to any auto model class needs -to match the `config_class` of those models. diff --git a/docs/source/en/custom_tools.md b/docs/source/en/custom_tools.md new file mode 100644 index 000000000000..86183a80752e --- /dev/null +++ b/docs/source/en/custom_tools.md @@ -0,0 +1,789 @@ + + +# Custom Tools and Prompts + + + +If you are not aware of what tools and agents are in the context of transformers, we recommend you read the +[Transformers Agents](transformers_agents) page first. + + + + + +Transformers Agents is an experimental API that is subject to change at any time. Results returned by the agents +can vary as the APIs or underlying models are prone to change. + + + +Creating and using custom tools and prompts is paramount to empowering the agent and having it perform new tasks. +In this guide we'll take a look at: + +- How to customize the prompt +- How to use custom tools +- How to create custom tools + +## Customizing the prompt + +As explained in [Transformers Agents](transformers_agents) agents can run in [`~Agent.run`] and [`~Agent.chat`] mode. +Both the `run` and `chat` modes underlie the same logic. The language model powering the agent is conditioned on a long +prompt and completes the prompt by generating the next tokens until the stop token is reached. 
+The only difference between the two modes is that during the `chat` mode the prompt is extended with +previous user inputs and model generations. This allows the agent to have access to past interactions, +seemingly giving the agent some kind of memory. + +### Structure of the prompt + +Let's take a closer look at how the prompt is structured to understand how it can be best customized. +The prompt is structured broadly into four parts. + +- 1. Introduction: how the agent should behave, explanation of the concept of tools. +- 2. Description of all the tools. This is defined by a `<>` token that is dynamically replaced at runtime with the tools defined/chosen by the user. +- 3. A set of examples of tasks and their solution +- 4. Current example, and request for solution. + +To better understand each part, let's look at a shortened version of how the `run` prompt can look like: + +````text +I will ask you to perform a task, your job is to come up with a series of simple commands in Python that will perform the task. +[...] +You can print intermediate results if it makes sense to do so. + +Tools: +- document_qa: This is a tool that answers a question about a document (pdf). It takes an input named `document` which should be the document containing the information, as well as a `question` that is the question about the document. It returns a text that contains the answer to the question. +- image_captioner: This is a tool that generates a description of an image. It takes an input named `image` which should be the image to the caption and returns a text that contains the description in English. +[...] + +Task: "Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French." + +I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image. + +Answer: +```py +translated_question = translator(question=question, src_lang="French", tgt_lang="English") +print(f"The translated question is {translated_question}.") +answer = image_qa(image=image, question=translated_question) +print(f"The answer is {answer}") +``` + +Task: "Identify the oldest person in the `document` and create an image showcasing the result as a banner." + +I will use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer. + +Answer: +```py +answer = document_qa(document, question="What is the oldest person?") +print(f"The answer is {answer}.") +image = image_generator("A banner showing " + answer) +``` + +[...] + +Task: "Draw me a picture of rivers and lakes" + +I will use the following +```` + +The introduction (the text before *"Tools:"*) explains precisely how the model shall behave and what it should do. +This part most likely does not need to be customized as the agent shall always behave the same way. + +The second part (the bullet points below *"Tools"*) is dynamically added upon calling `run` or `chat`. There are +exactly as many bullet points as there are tools in `agent.toolbox` and each bullet point consists of the name +and description of the tool: + +```text +- : +``` + +Let's verify this quickly by loading the document_qa tool and printing out the name and description. 
+ +```py +from transformers import load_tool + +document_qa = load_tool("document-question-answering") +print(f"- {document_qa.name}: {document_qa.description}") +``` + +which gives: +```text +- document_qa: This is a tool that answers a question about a document (pdf). It takes an input named `document` which should be the document containing the information, as well as a `question` that is the question about the document. It returns a text that contains the answer to the question. +``` + +We can see that the tool name is short and precise. The description includes two parts, the first explaining +what the tool does and the second states what input arguments and return values are expected. + +A good tool name and tool description are very important for the agent to correctly use it. Note that the only +information the agent has about the tool is its name and description, so one should make sure that both +are precisely written and match the style of the existing tools in the toolbox. In particular make sure the description +mentions all the arguments expected by name in code-style, along with the expected type and a description of what they +are. + + + +Check the naming and description of the curated Transformers tools to better understand what name and +description a tool is expected to have. You can see all tools with the [`Agent.toolbox`] property. + + + +The third part includes a set of curated examples that show the agent exactly what code it should produce +for what kind of user request. The large language models empowering the agent are extremely good at +recognizing patterns in a prompt and repeating the pattern with new data. Therefore, it is very important +that the examples are written in a way that maximizes the likelihood of the agent to generating correct, +executable code in practice. + +Let's have a look at one example: + +````text +Task: "Identify the oldest person in the `document` and create an image showcasing the result as a banner." + +I will use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer. + +Answer: +```py +answer = document_qa(document, question="What is the oldest person?") +print(f"The answer is {answer}.") +image = image_generator("A banner showing " + answer) +``` + +```` + +The pattern the model is prompted to repeat has three parts: The task statement, the agent's explanation of +what it intends to do, and finally the generated code. Every example that is part of the prompt has this exact +pattern, thus making sure that the agent will reproduce exactly the same pattern when generating new tokens. + +The prompt examples are curated by the Transformers team and rigorously evaluated on a set of +[problem statements](https://github.com/huggingface/transformers/blob/main/src/transformers/tools/evaluate_agent.py) +to ensure that the agent's prompt is as good as possible to solve real use cases of the agent. + +The final part of the prompt corresponds to: +```text +Task: "Draw me a picture of rivers and lakes" + +I will use the following +``` + +is a final and unfinished example that the agent is tasked to complete. The unfinished example +is dynamically created based on the actual user input. For the above example, the user ran: + +```py +agent.run("Draw me a picture of rivers and lakes") +``` + +The user input - *a.k.a* the task: *"Draw me a picture of rivers and lakes"* is cast into the +prompt template: "Task: \n\n I will use the following". 
This sentence makes up the final lines of the +prompt the agent is conditioned on, therefore strongly influencing the agent to finish the example +exactly in the same way it was previously done in the examples. + +Without going into too much detail, the chat template has the same prompt structure with the +examples having a slightly different style, *e.g.*: + +````text +[...] + +===== + +Human: Answer the question in the variable `question` about the image stored in the variable `image`. + +Assistant: I will use the tool `image_qa` to answer the question on the input image. + +```py +answer = image_qa(text=question, image=image) +print(f"The answer is {answer}") +``` + +Human: I tried this code, it worked but didn't give me a good result. The question is in French + +Assistant: In this case, the question needs to be translated first. I will use the tool `translator` to do this. + +```py +translated_question = translator(question=question, src_lang="French", tgt_lang="English") +print(f"The translated question is {translated_question}.") +answer = image_qa(text=translated_question, image=image) +print(f"The answer is {answer}") +``` + +===== + +[...] +```` + +Contrary, to the examples of the `run` prompt, each `chat` prompt example has one or more exchanges between the +*Human* and the *Assistant*. Every exchange is structured similarly to the example of the `run` prompt. +The user's input is appended to behind *Human:* and the agent is prompted to first generate what needs to be done +before generating code. An exchange can be based on previous exchanges, therefore allowing the user to refer +to past exchanges as is done *e.g.* above by the user's input of "I tried **this** code" refers to the +previously generated code of the agent. + +Upon running `.chat`, the user's input or *task* is cast into an unfinished example of the form: +```text +Human: \n\nAssistant: +``` +which the agent completes. Contrary to the `run` command, the `chat` command then appends the completed example +to the prompt, thus giving the agent more context for the next `chat` turn. + +Great now that we know how the prompt is structured, let's see how we can customize it! + +### Writing good user inputs + +While large language models are getting better and better at understanding users' intentions, it helps +enormously to be as precise as possible to help the agent pick the correct task. What does it mean to be +as precise as possible? + +The agent sees a list of tool names and their description in its prompt. The more tools are added the +more difficult it becomes for the agent to choose the correct tool and it's even more difficult to choose +the correct sequences of tools to run. Let's look at a common failure case, here we will only return +the code to analyze it. + +```py +from transformers import HfAgent + +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder") + +agent.run("Show me a tree", return_code=True) +``` + +gives: + +```text +==Explanation from the agent== +I will use the following tool: `image_segmenter` to create a segmentation mask for the image. + + +==Code generated by the agent== +mask = image_segmenter(image, prompt="tree") +``` + +which is probably not what we wanted. Instead, it is more likely that we want an image of a tree to be generated. +To steer the agent more towards using a specific tool it can therefore be very helpful to use important keywords that +are present in the tool's name and description. Let's have a look. 
+```py +agent.toolbox["image_generator"].description +``` + +```text +'This is a tool that creates an image according to a prompt, which is a text description. It takes an input named `prompt` which contains the image description and outputs an image. +``` + +The name and description make use of the keywords "image", "prompt", "create" and "generate". Using these words will most likely work better here. Let's refine our prompt a bit. + +```py +agent.run("Create an image of a tree", return_code=True) +``` + +gives: +```text +==Explanation from the agent== +I will use the following tool `image_generator` to generate an image of a tree. + + +==Code generated by the agent== +image = image_generator(prompt="tree") +``` + +Much better! That looks more like what we want. In short, when you notice that the agent struggles to +correctly map your task to the correct tools, try looking up the most pertinent keywords of the tool's name +and description and try refining your task request with it. + +### Customizing the tool descriptions + +As we've seen before the agent has access to each of the tools' names and descriptions. The base tools +should have very precise names and descriptions, however, you might find that it could help to change the +the description or name of a tool for your specific use case. This might become especially important +when you've added multiple tools that are very similar or if you want to use your agent only for a certain +domain, *e.g.* image generation and transformations. + +A common problem is that the agent confuses image generation with image transformation/modification when +used a lot for image generation tasks, *e.g.* +```py +agent.run("Make an image of a house and a car", return_code=True) +``` +returns +```text +==Explanation from the agent== +I will use the following tools `image_generator` to generate an image of a house and `image_transformer` to transform the image of a car into the image of a house. + +==Code generated by the agent== +house_image = image_generator(prompt="A house") +car_image = image_generator(prompt="A car") +house_car_image = image_transformer(image=car_image, prompt="A house") +``` + +which is probably not exactly what we want here. It seems like the agent has a difficult time +to understand the difference between `image_generator` and `image_transformer` and often uses the two together. + +We can help the agent here by changing the tool name and description of `image_transformer`. Let's instead call it `modifier` +to disassociate it a bit from "image" and "prompt": +```py +agent.toolbox["modifier"] = agent.toolbox.pop("image_transformer") +agent.toolbox["modifier"].description = agent.toolbox["modifier"].description.replace( + "transforms an image according to a prompt", "modifies an image" +) +``` + +Now "modify" is a strong cue to use the new image processor which should help with the above prompt. Let's run it again. + +```py +agent.run("Make an image of a house and a car", return_code=True) +``` + +Now we're getting: +```text +==Explanation from the agent== +I will use the following tools: `image_generator` to generate an image of a house, then `image_generator` to generate an image of a car. + + +==Code generated by the agent== +house_image = image_generator(prompt="A house") +car_image = image_generator(prompt="A car") +``` + +which is definitely closer to what we had in mind! However, we want to have both the house and car in the same image. 
Steering the task more toward single image generation should help:

```py
agent.run("Create image: 'A house and car'", return_code=True)
```

```text
==Explanation from the agent==
I will use the following tool: `image_generator` to generate an image.


==Code generated by the agent==
image = image_generator(prompt="A house and car")
```

Agents are still brittle for many use cases, especially when it comes to
slightly more complex use cases like generating an image of multiple objects.
Both the agent itself and the underlying prompt will be further improved in the coming
months, making sure that agents become more robust to a variety of user inputs.

### Customizing the whole prompt

To give the user maximum flexibility, the whole prompt template, as explained [above](#structure-of-the-prompt),
can be overwritten by the user. In this case, make sure that your custom prompt includes an introduction section,
a tool section, an example section, and an unfinished example section. If you want to overwrite the `run` prompt template,
you can do so as follows:

```py
template = """ [...] """

agent = HfAgent(your_endpoint, run_prompt_template=template)
```

Please make sure to have the `<<all_tools>>` string and the `<<prompt>>` string defined somewhere in the `template` so that the agent is aware
of the tools it has available to it and can correctly insert the user's prompt.

Similarly, one can overwrite the `chat` prompt template. Note that the `chat` mode always uses the following format for the exchanges:
```text
Human: <<task>>

Assistant:
```

Therefore, it is important that the examples of the custom `chat` prompt template also make use of this format.
You can overwrite the `chat` template at instantiation as follows.

```py
template = """ [...] """

agent = HfAgent(url_endpoint=your_endpoint, chat_prompt_template=template)
```

Please make sure to have the `<<all_tools>>` string defined somewhere in the `template` so that the agent is aware
of the tools it has available to it.

In both cases, you can pass a repo ID instead of the prompt template if you would like to use a template hosted by someone in the community. The default prompts live in [this repo](https://huggingface.co/datasets/huggingface-tools/default-prompts) as an example.

To upload your custom prompt to a repo on the Hub and share it with the community, just make sure:
- to use a dataset repository
- to put the prompt template for the `run` command in a file named `run_prompt_template.txt`
- to put the prompt template for the `chat` command in a file named `chat_prompt_template.txt`

## Using custom tools

In this section, we'll be leveraging two existing custom tools that are specific to image generation:

- We replace [huggingface-tools/image-transformation](https://huggingface.co/spaces/huggingface-tools/image-transformation)
  with [diffusers/controlnet-canny-tool](https://huggingface.co/spaces/diffusers/controlnet-canny-tool)
  to allow for more image modifications.
- We add a new tool for image upscaling to the default toolbox:
  [diffusers/latent-upscaler-tool](https://huggingface.co/spaces/diffusers/latent-upscaler-tool).
+ +We'll start by loading the custom tools with the convenient [`load_tool`] function: + +```py +from transformers import load_tool + +controlnet_transformer = load_tool("diffusers/controlnet-canny-tool") +upscaler = load_tool("diffusers/latent-upscaler-tool") +``` + +Upon adding custom tools to an agent, the tools' descriptions and names are automatically +included in the agents' prompts. Thus, it is imperative that custom tools have +a well-written description and name in order for the agent to understand how to use them. +Let's take a look at the description and name of `controlnet_transformer`: + +```py +print(f"Description: '{controlnet_transformer.description}'") +print(f"Name: '{controlnet_transformer.name}'") +``` + +gives +```text +Description: 'This is a tool that transforms an image with ControlNet according to a prompt. +It takes two inputs: `image`, which should be the image to transform, and `prompt`, which should be the prompt to use to change it. It returns the modified image.' +Name: 'image_transformer' +``` + +The name and description are accurate and fit the style of the [curated set of tools](./transformers_agents#a-curated-set-of-tools). +Next, let's instantiate an agent with `controlnet_transformer` and `upscaler`: + +```py +tools = [controlnet_transformer, upscaler] +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=tools) +``` + +This command should give you the following info: + +```text +image_transformer has been replaced by as provided in `additional_tools` +``` + +The set of curated tools already has an `image_transformer` tool which is hereby replaced with our custom tool. + + + +Overwriting existing tools can be beneficial if we want to use a custom tool exactly for the same task as an existing tool +because the agent is well-versed in using the specific task. Beware that the custom tool should follow the exact same API +as the overwritten tool in this case, or you should adapt the prompt template to make sure all examples using that +tool are updated. + + + +The upscaler tool was given the name `image_upscaler` which is not yet present in the default toolbox and is therefore simply added to the list of tools. +You can always have a look at the toolbox that is currently available to the agent via the `agent.toolbox` attribute: + +```py +print("\n".join([f"- {a}" for a in agent.toolbox.keys()])) +``` + +```text +- document_qa +- image_captioner +- image_qa +- image_segmenter +- transcriber +- summarizer +- text_classifier +- text_qa +- text_reader +- translator +- image_transformer +- text_downloader +- image_generator +- video_generator +- image_upscaler +``` + +Note how `image_upscaler` is now part of the agents' toolbox. + +Let's now try out the new tools! We will re-use the image we generated in [Transformers Agents Quickstart](./transformers_agents#single-execution-run). + +```py +from diffusers.utils import load_image + +image = load_image( + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" +) +``` + + + +Let's transform the image into a beautiful winter landscape: + +```py +image = agent.run("Transform the image: 'A frozen lake and snowy forest'", image=image) +``` + +```text +==Explanation from the agent== +I will use the following tool: `image_transformer` to transform the image. 
+ + +==Code generated by the agent== +image = image_transformer(image, prompt="A frozen lake and snowy forest") +``` + + + +The new image processing tool is based on ControlNet which can make very strong modifications to the image. +By default the image processing tool returns an image of size 512x512 pixels. Let's see if we can upscale it. + +```py +image = agent.run("Upscale the image", image) +``` + +```text +==Explanation from the agent== +I will use the following tool: `image_upscaler` to upscale the image. + + +==Code generated by the agent== +upscaled_image = image_upscaler(image) +``` + + + +The agent automatically mapped our prompt "Upscale the image" to the just added upscaler tool purely based on the description and name of the upscaler tool +and was able to correctly run it. + +Next, let's have a look at how you can create a new custom tool. + +### Adding new tools + +In this section, we show how to create a new tool that can be added to the agent. + +#### Creating a new tool + +We'll first start by creating a tool. We'll add the not-so-useful yet fun task of fetching the model on the Hugging Face +Hub with the most downloads for a given task. + +We can do that with the following code: + +```python +from huggingface_hub import list_models + +task = "text-classification" + +model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) +print(model.id) +``` + +For the task `text-classification`, this returns `'facebook/bart-large-mnli'`, for `translation` it returns `'t5-base`. + +How do we convert this to a tool that the agent can leverage? All tools depend on the superclass `Tool` that holds the +main attributes necessary. We'll create a class that inherits from it: + +```python +from transformers import Tool + + +class HFModelDownloadsTool(Tool): + pass +``` + +This class has a few needs: +- An attribute `name`, which corresponds to the name of the tool itself. To be in tune with other tools which have a + performative name, we'll name it `model_download_counter`. +- An attribute `description`, which will be used to populate the prompt of the agent. +- `inputs` and `outputs` attributes. Defining this will help the python interpreter make educated choices about types, + and will allow for a gradio-demo to be spawned when we push our tool to the Hub. They're both a list of expected + values, which can be `text`, `image`, or `audio`. +- A `__call__` method which contains the inference code. This is the code we've played with above! + +Here's what our class looks like now: + +```python +from transformers import Tool +from huggingface_hub import list_models + + +class HFModelDownloadsTool(Tool): + name = "model_download_counter" + description = ( + "This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. " + "It takes the name of the category (such as text-classification, depth-estimation, etc), and " + "returns the name of the checkpoint." + ) + + inputs = ["text"] + outputs = ["text"] + + def __call__(self, task: str): + model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) + return model.id +``` + +We now have our tool handy. Save it in a file and import it from your main script. Let's name this file +`model_downloads.py`, so the resulting import code looks like this: + +```python +from model_downloads import HFModelDownloadsTool + +tool = HFModelDownloadsTool() +``` + +In order to let others benefit from it and for simpler initialization, we recommend pushing it to the Hub under your +namespace. 
To do so, just call `push_to_hub` on the `tool` variable: + +```python +tool.push_to_hub("hf-model-downloads") +``` + +You now have your code on the Hub! Let's take a look at the final step, which is to have the agent use it. + +#### Having the agent use the tool + +We now have our tool that lives on the Hub which can be instantiated as such (change the user name for your tool): + +```python +from transformers import load_tool + +tool = load_tool("lysandre/hf-model-downloads") +``` + +In order to use it in the agent, simply pass it in the `additional_tools` parameter of the agent initialization method: + +```python +from transformers import HfAgent + +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=[tool]) + +agent.run( + "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?" +) +``` +which outputs the following: +```text +==Code generated by the agent== +model = model_download_counter(task="text-to-video") +print(f"The model with the most downloads is {model}.") +audio_model = text_reader(model) + + +==Result== +The model with the most downloads is damo-vilab/text-to-video-ms-1.7b. +``` + +and generates the following audio. + +| **Audio** | +|------------------------------------------------------------------------------------------------------------------------------------------------------| +| ` to the beginning of every prompt. **Note**: Make sure to pass `use_fast=False` when loading OPT's tokenizer with [`AutoTokenizer`] to get the correct tokenizer. - -This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ), [Younes Belkada](https://huggingface.co/ybelkada), and [Patrick Von Platen](https://huggingface.co/patrickvonplaten). -The original code can be found [here](https://github.com/facebookresearch/metaseq). - - -## OPTConfig - -[[autodoc]] OPTConfig - -## OPTModel - -[[autodoc]] OPTModel - - forward - -## OPTForCausalLM - -[[autodoc]] OPTForCausalLM - - forward - -## TFOPTModel - -[[autodoc]] TFOPTModel - - call - -## TFOPTForCausalLM - -[[autodoc]] TFOPTForCausalLM - - call - -## OPTForSequenceClassification - -[[autodoc]] OPTForSequenceClassification - - forward - -## OPTForQuestionAnswering - -[[autodoc]] OPTForQuestionAnswering - - forward - -## FlaxOPTModel - -[[autodoc]] FlaxOPTModel - - __call__ - - -## FlaxOPTForCausalLM - -[[autodoc]] FlaxOPTForCausalLM - - __call__ \ No newline at end of file diff --git a/docs/source/en/model_doc/owlvit.md b/docs/source/en/model_doc/owlvit.md new file mode 100644 index 000000000000..b18b80b40511 --- /dev/null +++ b/docs/source/en/model_doc/owlvit.md @@ -0,0 +1,118 @@ + + +# OWL-ViT + +## Overview + +The OWL-ViT (short for Vision Transformer for Open-World Localization) was proposed in [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. OWL-ViT is an open-vocabulary object detection network trained on a variety of (image, text) pairs. It can be used to query an image with one or multiple text queries to search for and detect target objects described in text. + +The abstract from the paper is the following: + +*Combining simple architectures with large-scale pre-training has led to massive improvements in image classification. 
For object detection, pre-training and scaling approaches are less well established, especially in the long-tailed and open-vocabulary setting, where training data is relatively scarce. In this paper, we propose a strong recipe for transferring image-text models to open-vocabulary object detection. We use a standard Vision Transformer architecture with minimal modifications, contrastive image-text pre-training, and end-to-end detection fine-tuning. Our analysis of the scaling properties of this setup shows that increasing image-level pre-training and model size yield consistent improvements on the downstream detection task. We provide the adaptation strategies and regularizations needed to attain very strong performance on zero-shot text-conditioned and one-shot image-conditioned object detection. Code and models are available on GitHub.* + +## Usage + +OWL-ViT is a zero-shot text-conditioned object detection model. OWL-ViT uses [CLIP](clip) as its multi-modal backbone, with a ViT-like Transformer to get visual features and a causal language model to get the text features. To use CLIP for detection, OWL-ViT removes the final token pooling layer of the vision model and attaches a lightweight classification and box head to each transformer output token. Open-vocabulary classification is enabled by replacing the fixed classification layer weights with the class-name embeddings obtained from the text model. The authors first train CLIP from scratch and fine-tune it end-to-end with the classification and box heads on standard detection datasets using a bipartite matching loss. One or multiple text queries per image can be used to perform zero-shot text-conditioned object detection. + +[`OwlViTImageProcessor`] can be used to resize (or rescale) and normalize images for the model and [`CLIPTokenizer`] is used to encode the text. [`OwlViTProcessor`] wraps [`OwlViTImageProcessor`] and [`CLIPTokenizer`] into a single instance to both encode the text and prepare the images. The following example shows how to perform object detection using [`OwlViTProcessor`] and [`OwlViTForObjectDetection`]. + + +```python +>>> import requests +>>> from PIL import Image +>>> import torch + +>>> from transformers import OwlViTProcessor, OwlViTForObjectDetection + +>>> processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32") +>>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32") + +>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" +>>> image = Image.open(requests.get(url, stream=True).raw) +>>> texts = [["a photo of a cat", "a photo of a dog"]] +>>> inputs = processor(text=texts, images=image, return_tensors="pt") +>>> outputs = model(**inputs) + +>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2] +>>> target_sizes = torch.Tensor([image.size[::-1]]) +>>> # Convert outputs (bounding boxes and class logits) to COCO API +>>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1) +>>> i = 0 # Retrieve predictions for the first image for the corresponding text queries +>>> text = texts[i] +>>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"] +>>> for box, score, label in zip(boxes, scores, labels): +... box = [round(i, 2) for i in box.tolist()] +... 
print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}") +Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29] +Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17] +``` + +This model was contributed by [adirik](https://huggingface.co/adirik). The original code can be found [here](https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit). + +## OwlViTConfig + +[[autodoc]] OwlViTConfig + - from_text_vision_configs + +## OwlViTTextConfig + +[[autodoc]] OwlViTTextConfig + +## OwlViTVisionConfig + +[[autodoc]] OwlViTVisionConfig + +## OwlViTImageProcessor + +[[autodoc]] OwlViTImageProcessor + - preprocess + - post_process_object_detection + - post_process_image_guided_detection + +## OwlViTFeatureExtractor + +[[autodoc]] OwlViTFeatureExtractor + - __call__ + - post_process + - post_process_image_guided_detection + +## OwlViTProcessor + +[[autodoc]] OwlViTProcessor + +## OwlViTModel + +[[autodoc]] OwlViTModel + - forward + - get_text_features + - get_image_features + +## OwlViTTextModel + +[[autodoc]] OwlViTTextModel + - forward + +## OwlViTVisionModel + +[[autodoc]] OwlViTVisionModel + - forward + +## OwlViTForObjectDetection + +[[autodoc]] OwlViTForObjectDetection + - forward + - image_guided_detection diff --git a/docs/source/en/model_doc/owlvit.mdx b/docs/source/en/model_doc/owlvit.mdx deleted file mode 100644 index f13ad4a540e1..000000000000 --- a/docs/source/en/model_doc/owlvit.mdx +++ /dev/null @@ -1,118 +0,0 @@ - - -# OWL-ViT - -## Overview - -The OWL-ViT (short for Vision Transformer for Open-World Localization) was proposed in [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. OWL-ViT is an open-vocabulary object detection network trained on a variety of (image, text) pairs. It can be used to query an image with one or multiple text queries to search for and detect target objects described in text. - -The abstract from the paper is the following: - -*Combining simple architectures with large-scale pre-training has led to massive improvements in image classification. For object detection, pre-training and scaling approaches are less well established, especially in the long-tailed and open-vocabulary setting, where training data is relatively scarce. In this paper, we propose a strong recipe for transferring image-text models to open-vocabulary object detection. We use a standard Vision Transformer architecture with minimal modifications, contrastive image-text pre-training, and end-to-end detection fine-tuning. Our analysis of the scaling properties of this setup shows that increasing image-level pre-training and model size yield consistent improvements on the downstream detection task. We provide the adaptation strategies and regularizations needed to attain very strong performance on zero-shot text-conditioned and one-shot image-conditioned object detection. Code and models are available on GitHub.* - -## Usage - -OWL-ViT is a zero-shot text-conditioned object detection model. OWL-ViT uses [CLIP](clip) as its multi-modal backbone, with a ViT-like Transformer to get visual features and a causal language model to get the text features. 
To use CLIP for detection, OWL-ViT removes the final token pooling layer of the vision model and attaches a lightweight classification and box head to each transformer output token. Open-vocabulary classification is enabled by replacing the fixed classification layer weights with the class-name embeddings obtained from the text model. The authors first train CLIP from scratch and fine-tune it end-to-end with the classification and box heads on standard detection datasets using a bipartite matching loss. One or multiple text queries per image can be used to perform zero-shot text-conditioned object detection. - -[`OwlViTFeatureExtractor`] can be used to resize (or rescale) and normalize images for the model and [`CLIPTokenizer`] is used to encode the text. [`OwlViTProcessor`] wraps [`OwlViTFeatureExtractor`] and [`CLIPTokenizer`] into a single instance to both encode the text and prepare the images. The following example shows how to perform object detection using [`OwlViTProcessor`] and [`OwlViTForObjectDetection`]. - - -```python ->>> import requests ->>> from PIL import Image ->>> import torch - ->>> from transformers import OwlViTProcessor, OwlViTForObjectDetection - ->>> processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32") ->>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32") - ->>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" ->>> image = Image.open(requests.get(url, stream=True).raw) ->>> texts = [["a photo of a cat", "a photo of a dog"]] ->>> inputs = processor(text=texts, images=image, return_tensors="pt") ->>> outputs = model(**inputs) - ->>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2] ->>> target_sizes = torch.Tensor([image.size[::-1]]) ->>> # Convert outputs (bounding boxes and class logits) to COCO API ->>> results = processor.post_process(outputs=outputs, target_sizes=target_sizes) - ->>> i = 0 # Retrieve predictions for the first image for the corresponding text queries ->>> text = texts[i] ->>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"] - ->>> score_threshold = 0.1 ->>> for box, score, label in zip(boxes, scores, labels): -... box = [round(i, 2) for i in box.tolist()] -... if score >= score_threshold: -... print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}") -Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29] -Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17] -``` - -This model was contributed by [adirik](https://huggingface.co/adirik). The original code can be found [here](https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit). 
- -## OwlViTConfig - -[[autodoc]] OwlViTConfig - - from_text_vision_configs - -## OwlViTTextConfig - -[[autodoc]] OwlViTTextConfig - -## OwlViTVisionConfig - -[[autodoc]] OwlViTVisionConfig - -## OwlViTImageProcessor - -[[autodoc]] OwlViTImageProcessor - - preprocess - - post_process_object_detection - - post_process_image_guided_detection - -## OwlViTFeatureExtractor - -[[autodoc]] OwlViTFeatureExtractor - - __call__ - - post_process - - post_process_image_guided_detection - -## OwlViTProcessor - -[[autodoc]] OwlViTProcessor - -## OwlViTModel - -[[autodoc]] OwlViTModel - - forward - - get_text_features - - get_image_features - -## OwlViTTextModel - -[[autodoc]] OwlViTTextModel - - forward - -## OwlViTVisionModel - -[[autodoc]] OwlViTVisionModel - - forward - -## OwlViTForObjectDetection - -[[autodoc]] OwlViTForObjectDetection - - forward - - image_guided_detection diff --git a/docs/source/en/model_doc/pegasus.md b/docs/source/en/model_doc/pegasus.md new file mode 100644 index 000000000000..14608aae31c9 --- /dev/null +++ b/docs/source/en/model_doc/pegasus.md @@ -0,0 +1,166 @@ + + +# Pegasus + +
**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=sshleifer&labels=&template=bug-report.md&title)
and assign @patrickvonplaten.


## Overview

The Pegasus model was proposed in [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/pdf/1912.08777.pdf) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu on Dec 18, 2019.

According to the abstract,

- Pegasus' pretraining task is intentionally similar to summarization: important sentences are removed/masked from an
  input document and are generated together as one output sequence from the remaining sentences, similar to an
  extractive summary.
- Pegasus achieves SOTA summarization performance on all 12 downstream tasks, as measured by ROUGE and human eval.

This model was contributed by [sshleifer](https://huggingface.co/sshleifer). The Authors' code can be found [here](https://github.com/google-research/pegasus).

Tips:

- Sequence-to-sequence model with the same encoder-decoder model architecture as BART. Pegasus is pre-trained jointly on two self-supervised objective functions: Masked Language Modeling (MLM) and a novel summarization-specific pretraining objective, called Gap Sentence Generation (GSG).

  * MLM: encoder input tokens are randomly replaced by mask tokens and have to be predicted by the encoder (like in BERT)
  * GSG: whole encoder input sentences are replaced by a second mask token and fed to the decoder, which has a causal mask to hide future words, like a regular auto-regressive transformer decoder.

## Checkpoints

All the [checkpoints](https://huggingface.co/models?search=pegasus) are fine-tuned for summarization, besides
*pegasus-large*, from which the other checkpoints are fine-tuned:

- Each checkpoint is 2.2 GB on disk and has 568M parameters.
- FP16 is not supported (help/ideas on this appreciated!).
- Summarizing xsum in fp32 takes about 400ms/sample, with default parameters on a v100 GPU.
- Full replication results and correctly pre-processed data can be found in this [Issue](https://github.com/huggingface/transformers/issues/6844#issue-689259666).
- [Distilled checkpoints](https://huggingface.co/models?search=distill-pegasus) are described in this [paper](https://arxiv.org/abs/2010.13002).

### Examples

- [Script](https://github.com/huggingface/transformers/tree/main/examples/research_projects/seq2seq-distillation/finetune_pegasus_xsum.sh) to fine-tune pegasus
  on the XSUM dataset. Data download instructions at [examples/pytorch/summarization/](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization/README.md).
- FP16 is not supported (help/ideas on this appreciated!).
- The adafactor optimizer is recommended for pegasus fine-tuning.


## Implementation Notes

- All models are transformer encoder-decoders with 16 layers in each component.
- The implementation is completely inherited from [`BartForConditionalGeneration`].
- Some key configuration differences:

  - static, sinusoidal position embeddings
  - the model starts generating with pad_token_id (which has 0 token_embedding) as the prefix.
  - more beams are used (`num_beams=8`)
- All pretrained pegasus checkpoints are the same besides three attributes: `tokenizer.model_max_length` (maximum
  input size), `max_length` (the maximum number of tokens to generate) and `length_penalty`.
+- The code to convert checkpoints trained in the author's [repo](https://github.com/google-research/pegasus) can be + found in `convert_pegasus_tf_to_pytorch.py`. + + +## Usage Example + +```python +>>> from transformers import PegasusForConditionalGeneration, PegasusTokenizer +>>> import torch + +>>> src_text = [ +... """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.""" +... ] + +... model_name = "google/pegasus-xsum" +... device = "cuda" if torch.cuda.is_available() else "cpu" +... tokenizer = PegasusTokenizer.from_pretrained(model_name) +... model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device) +... batch = tokenizer(src_text, truncation=True, padding="longest", return_tensors="pt").to(device) +... translated = model.generate(**batch) +... tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True) +... assert ( +... tgt_text[0] +... == "California's largest electricity provider has turned off power to hundreds of thousands of customers." +... ) +``` + +## Documentation resources + +- [Causal language modeling task guide](../tasks/language_modeling) +- [Translation task guide](../tasks/translation) +- [Summarization task guide](../tasks/summarization) + +## PegasusConfig + +[[autodoc]] PegasusConfig + +## PegasusTokenizer + +warning: `add_tokens` does not work at the moment. + +[[autodoc]] PegasusTokenizer + +## PegasusTokenizerFast + +[[autodoc]] PegasusTokenizerFast + +## PegasusModel + +[[autodoc]] PegasusModel + - forward + +## PegasusForConditionalGeneration + +[[autodoc]] PegasusForConditionalGeneration + - forward + +## PegasusForCausalLM + +[[autodoc]] PegasusForCausalLM + - forward + +## TFPegasusModel + +[[autodoc]] TFPegasusModel + - call + +## TFPegasusForConditionalGeneration + +[[autodoc]] TFPegasusForConditionalGeneration + - call + +## FlaxPegasusModel + +[[autodoc]] FlaxPegasusModel + - __call__ + - encode + - decode + +## FlaxPegasusForConditionalGeneration + +[[autodoc]] FlaxPegasusForConditionalGeneration + - __call__ + - encode + - decode diff --git a/docs/source/en/model_doc/pegasus.mdx b/docs/source/en/model_doc/pegasus.mdx deleted file mode 100644 index 52dd10b9bfa7..000000000000 --- a/docs/source/en/model_doc/pegasus.mdx +++ /dev/null @@ -1,141 +0,0 @@ - - -# Pegasus - -**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=sshleifer&labels=&template=bug-report.md&title) -and assign @patrickvonplaten. - - -## Overview - -The Pegasus model was proposed in [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/pdf/1912.08777.pdf) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu on Dec 18, 2019. - -According to the abstract, - -- Pegasus' pretraining task is intentionally similar to summarization: important sentences are removed/masked from an - input document and are generated together as one output sequence from the remaining sentences, similar to an - extractive summary. -- Pegasus achieves SOTA summarization performance on all 12 downstream tasks, as measured by ROUGE and human eval. - -This model was contributed by [sshleifer](https://huggingface.co/sshleifer). The Authors' code can be found [here](https://github.com/google-research/pegasus). 
- - -## Checkpoints - -All the [checkpoints](https://huggingface.co/models?search=pegasus) are fine-tuned for summarization, besides -*pegasus-large*, whence the other checkpoints are fine-tuned: - -- Each checkpoint is 2.2 GB on disk and 568M parameters. -- FP16 is not supported (help/ideas on this appreciated!). -- Summarizing xsum in fp32 takes about 400ms/sample, with default parameters on a v100 GPU. -- Full replication results and correctly pre-processed data can be found in this [Issue](https://github.com/huggingface/transformers/issues/6844#issue-689259666). -- [Distilled checkpoints](https://huggingface.co/models?search=distill-pegasus) are described in this [paper](https://arxiv.org/abs/2010.13002). - -### Examples - -- [Script](https://github.com/huggingface/transformers/tree/main/examples/research_projects/seq2seq-distillation/finetune_pegasus_xsum.sh) to fine-tune pegasus - on the XSUM dataset. Data download instructions at [examples/pytorch/summarization/](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization/README.md). -- FP16 is not supported (help/ideas on this appreciated!). -- The adafactor optimizer is recommended for pegasus fine-tuning. - - -## Implementation Notes - -- All models are transformer encoder-decoders with 16 layers in each component. -- The implementation is completely inherited from [`BartForConditionalGeneration`] -- Some key configuration differences: - - - static, sinusoidal position embeddings - - the model starts generating with pad_token_id (which has 0 token_embedding) as the prefix. - - more beams are used (`num_beams=8`) -- All pretrained pegasus checkpoints are the same besides three attributes: `tokenizer.model_max_length` (maximum - input size), `max_length` (the maximum number of tokens to generate) and `length_penalty`. -- The code to convert checkpoints trained in the author's [repo](https://github.com/google-research/pegasus) can be - found in `convert_pegasus_tf_to_pytorch.py`. - - -## Usage Example - -```python ->>> from transformers import PegasusForConditionalGeneration, PegasusTokenizer ->>> import torch - ->>> src_text = [ -... """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.""" -... ] - -... model_name = "google/pegasus-xsum" -... device = "cuda" if torch.cuda.is_available() else "cpu" -... tokenizer = PegasusTokenizer.from_pretrained(model_name) -... model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device) -... batch = tokenizer(src_text, truncation=True, padding="longest", return_tensors="pt").to(device) -... translated = model.generate(**batch) -... tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True) -... assert ( -... tgt_text[0] -... == "California's largest electricity provider has turned off power to hundreds of thousands of customers." -... ) -``` - -## PegasusConfig - -[[autodoc]] PegasusConfig - -## PegasusTokenizer - -warning: `add_tokens` does not work at the moment. 
- -[[autodoc]] PegasusTokenizer - -## PegasusTokenizerFast - -[[autodoc]] PegasusTokenizerFast - -## PegasusModel - -[[autodoc]] PegasusModel - - forward - -## PegasusForConditionalGeneration - -[[autodoc]] PegasusForConditionalGeneration - - forward - -## PegasusForCausalLM - -[[autodoc]] PegasusForCausalLM - - forward - -## TFPegasusModel - -[[autodoc]] TFPegasusModel - - call - -## TFPegasusForConditionalGeneration - -[[autodoc]] TFPegasusForConditionalGeneration - - call - -## FlaxPegasusModel - -[[autodoc]] FlaxPegasusModel - - __call__ - - encode - - decode - -## FlaxPegasusForConditionalGeneration - -[[autodoc]] FlaxPegasusForConditionalGeneration - - __call__ - - encode - - decode diff --git a/docs/source/en/model_doc/pegasus_x.md b/docs/source/en/model_doc/pegasus_x.md new file mode 100644 index 000000000000..a0fd670fc7c9 --- /dev/null +++ b/docs/source/en/model_doc/pegasus_x.md @@ -0,0 +1,54 @@ + + +# PEGASUS-X + +## Overview + +The PEGASUS-X model was proposed in [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao and Peter J. Liu. + +PEGASUS-X (PEGASUS eXtended) extends the PEGASUS models for long input summarization through additional long input pretraining and using staggered block-local attention with global tokens in the encoder. + +The abstract from the paper is the following: + +*While large pretrained Transformer models have proven highly capable at tackling natural language tasks, handling long sequence inputs continues to be a significant challenge. One such task is long input summarization, where inputs are longer than the maximum input context of most pretrained models. Through an extensive set of experiments, we investigate what model architectural changes and pretraining paradigms can most efficiently adapt a pretrained Transformer for long input summarization. We find that a staggered, block-local Transformer with global encoder tokens strikes a good balance of performance and efficiency, and that an additional pretraining phase on long sequences meaningfully improves downstream summarization performance. Based on our findings, we introduce PEGASUS-X, an extension of the PEGASUS model with additional long input pretraining to handle inputs of up to 16K tokens. PEGASUS-X achieves strong performance on long input summarization tasks comparable with much larger models while adding few additional parameters and not requiring model parallelism to train.* + +Tips: + +* PEGASUS-X uses the same tokenizer as PEGASUS. + +This model was contributed by [zphang]( - -# PEGASUS-X - -## Overview - -The PEGASUS-X model was proposed in [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao and Peter J. Liu. - -PEGASUS-X (PEGASUS eXtended) extends the PEGASUS models for long input summarization through additional long input pretraining and using staggered block-local attention with global tokens in the encoder. - -The abstract from the paper is the following: - -*While large pretrained Transformer models have proven highly capable at tackling natural language tasks, handling long sequence inputs continues to be a significant challenge. One such task is long input summarization, where inputs are longer than the maximum input context of most pretrained models. 
Through an extensive set of experiments, we investigate what model architectural changes and pretraining paradigms can most efficiently adapt a pretrained Transformer for long input summarization. We find that a staggered, block-local Transformer with global encoder tokens strikes a good balance of performance and efficiency, and that an additional pretraining phase on long sequences meaningfully improves downstream summarization performance. Based on our findings, we introduce PEGASUS-X, an extension of the PEGASUS model with additional long input pretraining to handle inputs of up to 16K tokens. PEGASUS-X achieves strong performance on long input summarization tasks comparable with much larger models while adding few additional parameters and not requiring model parallelism to train.* - -Tips: - -* PEGASUS-X uses the same tokenizer as PEGASUS. - -This model was contributed by [zphang]( + +# Perceiver + +## Overview + +The Perceiver IO model was proposed in [Perceiver IO: A General Architecture for Structured Inputs & +Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, +Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. +Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. + +Perceiver IO is a generalization of [Perceiver](https://arxiv.org/abs/2103.03206) to handle arbitrary outputs in +addition to arbitrary inputs. The original Perceiver only produced a single classification label. In addition to +classification labels, Perceiver IO can produce (for example) language, optical flow, and multimodal videos with audio. +This is done using the same building blocks as the original Perceiver. The computational complexity of Perceiver IO is +linear in the input and output size and the bulk of the processing occurs in the latent space, allowing us to process +inputs and outputs that are much larger than can be handled by standard Transformers. This means, for example, +Perceiver IO can do BERT-style masked language modeling directly using bytes instead of tokenized inputs. + +The abstract from the paper is the following: + +*The recently-proposed Perceiver model obtains good results on several domains (images, audio, multimodal, point +clouds) while scaling linearly in compute and memory with the input size. While the Perceiver supports many kinds of +inputs, it can only produce very simple outputs such as class scores. Perceiver IO overcomes this limitation without +sacrificing the original's appealing properties by learning to flexibly query the model's latent space to produce +outputs of arbitrary size and semantics. Perceiver IO still decouples model depth from data size and still scales +linearly with data size, but now with respect to both input and output sizes. The full Perceiver IO model achieves +strong results on tasks with highly structured output spaces, such as natural language and visual understanding, +StarCraft II, and multi-task and multi-modal domains. As highlights, Perceiver IO matches a Transformer-based BERT +baseline on the GLUE language benchmark without the need for input tokenization and achieves state-of-the-art +performance on Sintel optical flow estimation.* + +Here's a TLDR explaining how Perceiver works: + +The main problem with the self-attention mechanism of the Transformer is that the time and memory requirements scale +quadratically with the sequence length. 
Hence, models like BERT and RoBERTa are limited to a max sequence length of 512 +tokens. Perceiver aims to solve this issue by, instead of performing self-attention on the inputs, perform it on a set +of latent variables, and only use the inputs for cross-attention. In this way, the time and memory requirements don't +depend on the length of the inputs anymore, as one uses a fixed amount of latent variables, like 256 or 512. These are +randomly initialized, after which they are trained end-to-end using backpropagation. + +Internally, [`PerceiverModel`] will create the latents, which is a tensor of shape `(batch_size, num_latents, +d_latents)`. One must provide `inputs` (which could be text, images, audio, you name it!) to the model, which it will +use to perform cross-attention with the latents. The output of the Perceiver encoder is a tensor of the same shape. One +can then, similar to BERT, convert the last hidden states of the latents to classification logits by averaging along +the sequence dimension, and placing a linear layer on top of that to project the `d_latents` to `num_labels`. + +This was the idea of the original Perceiver paper. However, it could only output classification logits. In a follow-up +work, PerceiverIO, they generalized it to let the model also produce outputs of arbitrary size. How, you might ask? The +idea is actually relatively simple: one defines outputs of an arbitrary size, and then applies cross-attention with the +last hidden states of the latents, using the outputs as queries, and the latents as keys and values. + +So let's say one wants to perform masked language modeling (BERT-style) with the Perceiver. As the Perceiver's input +length will not have an impact on the computation time of the self-attention layers, one can provide raw bytes, +providing `inputs` of length 2048 to the model. If one now masks out certain of these 2048 tokens, one can define the +`outputs` as being of shape: `(batch_size, 2048, 768)`. Next, one performs cross-attention with the final hidden states +of the latents to update the `outputs` tensor. After cross-attention, one still has a tensor of shape `(batch_size, +2048, 768)`. One can then place a regular language modeling head on top, to project the last dimension to the +vocabulary size of the model, i.e. creating logits of shape `(batch_size, 2048, 262)` (as Perceiver uses a vocabulary +size of 262 byte IDs). + + + + Perceiver IO architecture. Taken from the original paper + +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found +[here](https://github.com/deepmind/deepmind-research/tree/master/perceiver). + +Tips: + +- The quickest way to get started with the Perceiver is by checking the [tutorial + notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Perceiver). +- Refer to the [blog post](https://huggingface.co/blog/perceiver) if you want to fully understand how the model works and +is implemented in the library. Note that the models available in the library only showcase some examples of what you can do +with the Perceiver. There are many more use cases, including question answering, named-entity recognition, object detection, +audio classification, video classification, etc. 
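
To make the byte-level masked language modeling workflow described above a bit more tangible, here is a minimal sketch using the `deepmind/language-perceiver` checkpoint; the masked byte positions below are chosen purely for illustration:

```python
from transformers import PerceiverTokenizer, PerceiverForMaskedLM

tokenizer = PerceiverTokenizer.from_pretrained("deepmind/language-perceiver")
model = PerceiverForMaskedLM.from_pretrained("deepmind/language-perceiver")

text = "This is an incomplete sentence where some words are missing."
# The tokenizer operates on raw UTF-8 bytes and pads up to the model's input length of 2048
encoding = tokenizer(text, padding="max_length", return_tensors="pt")

# Mask a span of byte positions (indices are illustrative)
encoding.input_ids[0, 52:61] = tokenizer.mask_token_id

outputs = model(inputs=encoding.input_ids, attention_mask=encoding.attention_mask)
print(outputs.logits.shape)  # torch.Size([1, 2048, 262]): one distribution over 262 byte IDs per input position
```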
+ +**Note**: + +- Perceiver does **not** work with `torch.nn.DataParallel` due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035) + +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Masked language modeling task guide](../tasks/masked_language_modeling) +- [Image classification task guide](../tasks/image_classification) + +## Perceiver specific outputs + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverModelOutput + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverDecoderOutput + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverMaskedLMOutput + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverClassifierOutput + +## PerceiverConfig + +[[autodoc]] PerceiverConfig + +## PerceiverTokenizer + +[[autodoc]] PerceiverTokenizer + - __call__ + +## PerceiverFeatureExtractor + +[[autodoc]] PerceiverFeatureExtractor + - __call__ + +## PerceiverImageProcessor + +[[autodoc]] PerceiverImageProcessor + - preprocess + +## PerceiverTextPreprocessor + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverTextPreprocessor + +## PerceiverImagePreprocessor + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverImagePreprocessor + +## PerceiverOneHotPreprocessor + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverOneHotPreprocessor + +## PerceiverAudioPreprocessor + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverAudioPreprocessor + +## PerceiverMultimodalPreprocessor + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverMultimodalPreprocessor + +## PerceiverProjectionDecoder + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverProjectionDecoder + +## PerceiverBasicDecoder + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverBasicDecoder + +## PerceiverClassificationDecoder + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverClassificationDecoder + +## PerceiverOpticalFlowDecoder + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverOpticalFlowDecoder + +## PerceiverBasicVideoAutoencodingDecoder + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverBasicVideoAutoencodingDecoder + +## PerceiverMultimodalDecoder + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverMultimodalDecoder + +## PerceiverProjectionPostprocessor + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverProjectionPostprocessor + +## PerceiverAudioPostprocessor + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverAudioPostprocessor + +## PerceiverClassificationPostprocessor + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverClassificationPostprocessor + +## PerceiverMultimodalPostprocessor + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverMultimodalPostprocessor + +## PerceiverModel + +[[autodoc]] PerceiverModel + - forward + +## PerceiverForMaskedLM + +[[autodoc]] PerceiverForMaskedLM + - forward + +## PerceiverForSequenceClassification + +[[autodoc]] PerceiverForSequenceClassification + - forward + +## PerceiverForImageClassificationLearned + +[[autodoc]] PerceiverForImageClassificationLearned + - forward + +## PerceiverForImageClassificationFourier + +[[autodoc]] PerceiverForImageClassificationFourier + - forward + +## PerceiverForImageClassificationConvProcessing + +[[autodoc]] PerceiverForImageClassificationConvProcessing + - forward + +## PerceiverForOpticalFlow + +[[autodoc]] PerceiverForOpticalFlow + - forward + +## PerceiverForMultimodalAutoencoding + +[[autodoc]] PerceiverForMultimodalAutoencoding + - 
forward diff --git a/docs/source/en/model_doc/perceiver.mdx b/docs/source/en/model_doc/perceiver.mdx deleted file mode 100644 index 52a928472c0d..000000000000 --- a/docs/source/en/model_doc/perceiver.mdx +++ /dev/null @@ -1,220 +0,0 @@ - - -# Perceiver - -## Overview - -The Perceiver IO model was proposed in [Perceiver IO: A General Architecture for Structured Inputs & -Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, -Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. -Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. - -Perceiver IO is a generalization of [Perceiver](https://arxiv.org/abs/2103.03206) to handle arbitrary outputs in -addition to arbitrary inputs. The original Perceiver only produced a single classification label. In addition to -classification labels, Perceiver IO can produce (for example) language, optical flow, and multimodal videos with audio. -This is done using the same building blocks as the original Perceiver. The computational complexity of Perceiver IO is -linear in the input and output size and the bulk of the processing occurs in the latent space, allowing us to process -inputs and outputs that are much larger than can be handled by standard Transformers. This means, for example, -Perceiver IO can do BERT-style masked language modeling directly using bytes instead of tokenized inputs. - -The abstract from the paper is the following: - -*The recently-proposed Perceiver model obtains good results on several domains (images, audio, multimodal, point -clouds) while scaling linearly in compute and memory with the input size. While the Perceiver supports many kinds of -inputs, it can only produce very simple outputs such as class scores. Perceiver IO overcomes this limitation without -sacrificing the original's appealing properties by learning to flexibly query the model's latent space to produce -outputs of arbitrary size and semantics. Perceiver IO still decouples model depth from data size and still scales -linearly with data size, but now with respect to both input and output sizes. The full Perceiver IO model achieves -strong results on tasks with highly structured output spaces, such as natural language and visual understanding, -StarCraft II, and multi-task and multi-modal domains. As highlights, Perceiver IO matches a Transformer-based BERT -baseline on the GLUE language benchmark without the need for input tokenization and achieves state-of-the-art -performance on Sintel optical flow estimation.* - -Here's a TLDR explaining how Perceiver works: - -The main problem with the self-attention mechanism of the Transformer is that the time and memory requirements scale -quadratically with the sequence length. Hence, models like BERT and RoBERTa are limited to a max sequence length of 512 -tokens. Perceiver aims to solve this issue by, instead of performing self-attention on the inputs, perform it on a set -of latent variables, and only use the inputs for cross-attention. In this way, the time and memory requirements don't -depend on the length of the inputs anymore, as one uses a fixed amount of latent variables, like 256 or 512. These are -randomly initialized, after which they are trained end-to-end using backpropagation. - -Internally, [`PerceiverModel`] will create the latents, which is a tensor of shape `(batch_size, num_latents, -d_latents)`. One must provide `inputs` (which could be text, images, audio, you name it!) 
to the model, which it will -use to perform cross-attention with the latents. The output of the Perceiver encoder is a tensor of the same shape. One -can then, similar to BERT, convert the last hidden states of the latents to classification logits by averaging along -the sequence dimension, and placing a linear layer on top of that to project the `d_latents` to `num_labels`. - -This was the idea of the original Perceiver paper. However, it could only output classification logits. In a follow-up -work, PerceiverIO, they generalized it to let the model also produce outputs of arbitrary size. How, you might ask? The -idea is actually relatively simple: one defines outputs of an arbitrary size, and then applies cross-attention with the -last hidden states of the latents, using the outputs as queries, and the latents as keys and values. - -So let's say one wants to perform masked language modeling (BERT-style) with the Perceiver. As the Perceiver's input -length will not have an impact on the computation time of the self-attention layers, one can provide raw bytes, -providing `inputs` of length 2048 to the model. If one now masks out certain of these 2048 tokens, one can define the -`outputs` as being of shape: `(batch_size, 2048, 768)`. Next, one performs cross-attention with the final hidden states -of the latents to update the `outputs` tensor. After cross-attention, one still has a tensor of shape `(batch_size, -2048, 768)`. One can then place a regular language modeling head on top, to project the last dimension to the -vocabulary size of the model, i.e. creating logits of shape `(batch_size, 2048, 262)` (as Perceiver uses a vocabulary -size of 262 byte IDs). - - - - Perceiver IO architecture. Taken from the original paper - -This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found -[here](https://github.com/deepmind/deepmind-research/tree/master/perceiver). - -Tips: - -- The quickest way to get started with the Perceiver is by checking the [tutorial - notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Perceiver). -- Refer to the [blog post](https://huggingface.co/blog/perceiver) if you want to fully understand how the model works and -is implemented in the library. Note that the models available in the library only showcase some examples of what you can do -with the Perceiver. There are many more use cases, including question answering, named-entity recognition, object detection, -audio classification, video classification, etc. 
- -**Note**: - -- Perceiver does **not** work with `torch.nn.DataParallel` due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035) - -## Perceiver specific outputs - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverModelOutput - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverDecoderOutput - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverMaskedLMOutput - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverClassifierOutput - -## PerceiverConfig - -[[autodoc]] PerceiverConfig - -## PerceiverTokenizer - -[[autodoc]] PerceiverTokenizer - - __call__ - -## PerceiverFeatureExtractor - -[[autodoc]] PerceiverFeatureExtractor - - __call__ - -## PerceiverImageProcessor - -[[autodoc]] PerceiverImageProcessor - - preprocess - -## PerceiverTextPreprocessor - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverTextPreprocessor - -## PerceiverImagePreprocessor - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverImagePreprocessor - -## PerceiverOneHotPreprocessor - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverOneHotPreprocessor - -## PerceiverAudioPreprocessor - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverAudioPreprocessor - -## PerceiverMultimodalPreprocessor - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverMultimodalPreprocessor - -## PerceiverProjectionDecoder - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverProjectionDecoder - -## PerceiverBasicDecoder - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverBasicDecoder - -## PerceiverClassificationDecoder - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverClassificationDecoder - -## PerceiverOpticalFlowDecoder - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverOpticalFlowDecoder - -## PerceiverBasicVideoAutoencodingDecoder - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverBasicVideoAutoencodingDecoder - -## PerceiverMultimodalDecoder - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverMultimodalDecoder - -## PerceiverProjectionPostprocessor - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverProjectionPostprocessor - -## PerceiverAudioPostprocessor - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverAudioPostprocessor - -## PerceiverClassificationPostprocessor - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverClassificationPostprocessor - -## PerceiverMultimodalPostprocessor - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverMultimodalPostprocessor - -## PerceiverModel - -[[autodoc]] PerceiverModel - - forward - -## PerceiverForMaskedLM - -[[autodoc]] PerceiverForMaskedLM - - forward - -## PerceiverForSequenceClassification - -[[autodoc]] PerceiverForSequenceClassification - - forward - -## PerceiverForImageClassificationLearned - -[[autodoc]] PerceiverForImageClassificationLearned - - forward - -## PerceiverForImageClassificationFourier - -[[autodoc]] PerceiverForImageClassificationFourier - - forward - -## PerceiverForImageClassificationConvProcessing - -[[autodoc]] PerceiverForImageClassificationConvProcessing - - forward - -## PerceiverForOpticalFlow - -[[autodoc]] PerceiverForOpticalFlow - - forward - -## PerceiverForMultimodalAutoencoding - -[[autodoc]] PerceiverForMultimodalAutoencoding - - forward diff --git a/docs/source/en/model_doc/persimmon.md b/docs/source/en/model_doc/persimmon.md new file mode 100644 index 000000000000..cf13d070c622 --- /dev/null +++ b/docs/source/en/model_doc/persimmon.md @@ -0,0 +1,96 @@ + + +# Persimmon + +## 
Overview
+
+The Persimmon model was created by [ADEPT](https://www.adept.ai/blog/persimmon-8b) and authored by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
+
+The authors introduced Persimmon-8B, a decoder model based on the classic transformers architecture, with query and key normalization. Persimmon-8B is a fully permissively-licensed model with approximately 8 billion parameters, released under the Apache license. Some of the key attributes of Persimmon-8B are its long context size (16K), performance, and capabilities for multimodal extensions.
+
+The authors showcase their approach to model evaluation, focusing on practical text generation, mirroring how users interact with language models. The work also includes a comparative analysis, pitting Persimmon-8B against other prominent models (MPT 7B Instruct and Llama 2 Base 7B 1-Shot) across various evaluation tasks. The results demonstrate Persimmon-8B's competitive performance, even with limited training data.
+
+In terms of model details, the work outlines the architecture and training methodology of Persimmon-8B, providing insights into its design choices, sequence length, and dataset composition. The authors present fast inference code that outperforms traditional implementations through operator fusion and CUDA graph utilization while maintaining code coherence. They express their anticipation of how the community will leverage this contribution to drive innovation, hinting at further upcoming releases as part of an ongoing series of developments.
+
+
+
+The `Persimmon` models were trained using `bfloat16`, but the original inference uses `float16`. The checkpoints uploaded on the hub use `torch_dtype = 'float16'`, which will be used by the `AutoModel` API to cast the checkpoints from `torch.float32` to `torch.float16`.
+
+The `dtype` of the online weights is mostly irrelevant unless you are using `torch_dtype="auto"` when initializing a model with `model = AutoModelForCausalLM.from_pretrained("path", torch_dtype="auto")`. The reason is that the model will first be downloaded (using the `dtype` of the checkpoints online) and then cast to the default `dtype` of `torch` (`torch.float32`). Users should specify the `torch_dtype` they want; if they don't, it will be `torch.float32`.
+
+Fine-tuning the model in `float16` is not recommended and is known to produce `nan`; the model should therefore be fine-tuned in `bfloat16`.
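As a minimal sketch of how the `torch_dtype` options above behave (here `"/output/path"` is an illustrative placeholder for the directory produced by the conversion script described in the tips below):

```python
>>> import torch
>>> from transformers import AutoModelForCausalLM

>>> # Inference: request float16 explicitly, matching the original inference setup
>>> model = AutoModelForCausalLM.from_pretrained("/output/path", torch_dtype=torch.float16)

>>> # Or keep whatever dtype is stored in the checkpoint's config
>>> model = AutoModelForCausalLM.from_pretrained("/output/path", torch_dtype="auto")

>>> # Fine-tuning: load in bfloat16 to avoid `nan` losses
>>> model = AutoModelForCausalLM.from_pretrained("/output/path", torch_dtype=torch.bfloat16)
```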
+
+
+Tips:
+
+- To convert the model, you need to clone the original repository using `git clone https://github.com/persimmon-ai-labs/adept-inference`, then get the checkpoints:
+
+```bash
+git clone https://github.com/persimmon-ai-labs/adept-inference
+wget https://axtkn4xl5cip.objectstorage.us-phoenix-1.oci.customer-oci.com/n/axtkn4xl5cip/b/adept-public-data/o/8b_base_model_release.tar
+tar -xvf 8b_base_model_release.tar
+python src/transformers/models/persimmon/convert_persimmon_weights_to_hf.py --input_dir /path/to/downloaded/persimmon/weights/ --output_dir /output/path \
+    --pt_model_path /path/to/8b_chat_model_release/iter_0001251/mp_rank_00/model_optim_rng.pt \
+    --ada_lib_path /path/to/adept-inference
+```
+
+For the chat model:
+```bash
+wget https://axtkn4xl5cip.objectstorage.us-phoenix-1.oci.customer-oci.com/n/axtkn4xl5cip/b/adept-public-data/o/8b_chat_model_release.tar
+tar -xvf 8b_chat_model_release.tar
+```
+
+Thereafter, models can be loaded via:
+
+```py
+from transformers import LlamaTokenizer, PersimmonForCausalLM
+
+model = PersimmonForCausalLM.from_pretrained("/output/path")
+tokenizer = LlamaTokenizer.from_pretrained("/output/path")
+```
+
+This model was contributed by [ArthurZ](https://huggingface.co/ArthurZ).
+The original code can be found [here](https://github.com/persimmon-ai-labs/adept-inference).
+
+- Persimmon uses a `sentencepiece`-based tokenizer with a `Unigram` model. It supports byte fallback, which is only available in `tokenizers==0.14.0` for the fast tokenizer. The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece. The `chat` template will be updated with the templating functions in a follow-up PR!
+
+- The authors suggest using the following prompt format for chat mode: `f"human: {prompt}\n\nadept:"`
+
+
+## PersimmonConfig
+
+[[autodoc]] PersimmonConfig
+
+## PersimmonModel
+
+[[autodoc]] PersimmonModel
+    - forward
+
+## PersimmonForCausalLM
+
+[[autodoc]] PersimmonForCausalLM
+    - forward
+
+## PersimmonForSequenceClassification
+
+[[autodoc]] PersimmonForSequenceClassification
+    - forward
diff --git a/docs/source/en/model_doc/phobert.md b/docs/source/en/model_doc/phobert.md
new file mode 100644
index 000000000000..5543a9b3541a
--- /dev/null
+++ b/docs/source/en/model_doc/phobert.md
@@ -0,0 +1,57 @@
+
+# PhoBERT
+
+## Overview
+
+The PhoBERT model was proposed in [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92.pdf) by Dat Quoc Nguyen, Anh Tuan Nguyen.
+
+The abstract from the paper is the following:
+
+*We present PhoBERT with two versions, PhoBERT-base and PhoBERT-large, the first public large-scale monolingual language models pre-trained for Vietnamese. Experimental results show that PhoBERT consistently outperforms the recent best pre-trained multilingual model XLM-R (Conneau et al., 2020) and improves the state-of-the-art in multiple Vietnamese-specific NLP tasks including Part-of-speech tagging, Dependency parsing, Named-entity recognition and Natural language inference.*
+
+Example of use:
+
+```python
+>>> import torch
+>>> from transformers import AutoModel, AutoTokenizer
+
+>>> phobert = AutoModel.from_pretrained("vinai/phobert-base")
+>>> tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
+
+>>> # INPUT TEXT MUST BE ALREADY WORD-SEGMENTED!
+>>> line = "Tôi là sinh_viên trường đại_học Công_nghệ ."
+
+>>> input_ids = torch.tensor([tokenizer.encode(line)])
+
+>>> with torch.no_grad():
+...
features = phobert(input_ids) # Models outputs are now tuples + +>>> # With TensorFlow 2.0+: +>>> # from transformers import TFAutoModel +>>> # phobert = TFAutoModel.from_pretrained("vinai/phobert-base") +``` + +This model was contributed by [dqnguyen](https://huggingface.co/dqnguyen). The original code can be found [here](https://github.com/VinAIResearch/PhoBERT). + +## PhobertTokenizer + +[[autodoc]] PhobertTokenizer diff --git a/docs/source/en/model_doc/phobert.mdx b/docs/source/en/model_doc/phobert.mdx deleted file mode 100644 index 4ae9b0aa6251..000000000000 --- a/docs/source/en/model_doc/phobert.mdx +++ /dev/null @@ -1,53 +0,0 @@ - - -# PhoBERT - -## Overview - -The PhoBERT model was proposed in [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92.pdf) by Dat Quoc Nguyen, Anh Tuan Nguyen. - -The abstract from the paper is the following: - -*We present PhoBERT with two versions, PhoBERT-base and PhoBERT-large, the first public large-scale monolingual -language models pre-trained for Vietnamese. Experimental results show that PhoBERT consistently outperforms the recent -best pre-trained multilingual model XLM-R (Conneau et al., 2020) and improves the state-of-the-art in multiple -Vietnamese-specific NLP tasks including Part-of-speech tagging, Dependency parsing, Named-entity recognition and -Natural language inference.* - -Example of use: - -```python ->>> import torch ->>> from transformers import AutoModel, AutoTokenizer - ->>> phobert = AutoModel.from_pretrained("vinai/phobert-base") ->>> tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base") - ->>> # INPUT TEXT MUST BE ALREADY WORD-SEGMENTED! ->>> line = "Tôi là sinh_viên trường đại_học Công_nghệ ." - ->>> input_ids = torch.tensor([tokenizer.encode(line)]) - ->>> with torch.no_grad(): -... features = phobert(input_ids) # Models outputs are now tuples - ->>> # With TensorFlow 2.0+: ->>> # from transformers import TFAutoModel ->>> # phobert = TFAutoModel.from_pretrained("vinai/phobert-base") -``` - -This model was contributed by [dqnguyen](https://huggingface.co/dqnguyen). The original code can be found [here](https://github.com/VinAIResearch/PhoBERT). - -## PhobertTokenizer - -[[autodoc]] PhobertTokenizer diff --git a/docs/source/en/model_doc/pix2struct.md b/docs/source/en/model_doc/pix2struct.md new file mode 100644 index 000000000000..b722a59b82e6 --- /dev/null +++ b/docs/source/en/model_doc/pix2struct.md @@ -0,0 +1,78 @@ + + +# Pix2Struct + +## Overview + +The Pix2Struct model was proposed in [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. + +The abstract from the paper is the following: + +> Visually-situated language is ubiquitous -- sources range from textbooks with diagrams to web pages with images and tables, to mobile apps with buttons and forms. Perhaps due to this diversity, previous work has typically relied on domain-specific recipes with limited sharing of the underlying data, model architectures, and objectives. We present Pix2Struct, a pretrained image-to-text model for purely visual language understanding, which can be finetuned on tasks containing visually-situated language. Pix2Struct is pretrained by learning to parse masked screenshots of web pages into simplified HTML. 
The web, with its richness of visual elements cleanly reflected in the HTML structure, provides a large source of pretraining data well suited to the diversity of downstream tasks. Intuitively, this objective subsumes common pretraining signals such as OCR, language modeling, image captioning. In addition to the novel pretraining strategy, we introduce a variable-resolution input representation and a more flexible integration of language and vision inputs, where language prompts such as questions are rendered directly on top of the input image. For the first time, we show that a single pretrained model can achieve state-of-the-art results in six out of nine tasks across four domains: documents, illustrations, user interfaces, and natural images. + +Tips: + +Pix2Struct has been fine tuned on a variety of tasks and datasets, ranging from image captioning, visual question answering (VQA) over different inputs (books, charts, science diagrams), captioning UI components etc. The full list can be found in Table 1 of the paper. +We therefore advise you to use these models for the tasks they have been fine tuned on. For instance, if you want to use Pix2Struct for UI captioning, you should use the model fine tuned on the UI dataset. If you want to use Pix2Struct for image captioning, you should use the model fine tuned on the natural images captioning dataset and so on. + +If you want to use the model to perform conditional text captioning, make sure to use the processor with `add_special_tokens=False`. + +This model was contributed by [ybelkada](https://huggingface.co/ybelkada). +The original code can be found [here](https://github.com/google-research/pix2struct). + +## Resources + +- [Fine-tuning Notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_pix2struct.ipynb) +- [All models](https://huggingface.co/models?search=pix2struct) + + +## Pix2StructConfig + +[[autodoc]] Pix2StructConfig + - from_text_vision_configs + +## Pix2StructTextConfig + +[[autodoc]] Pix2StructTextConfig + +## Pix2StructVisionConfig + +[[autodoc]] Pix2StructVisionConfig + +## Pix2StructProcessor + +[[autodoc]] Pix2StructProcessor + +## Pix2StructImageProcessor + +[[autodoc]] Pix2StructImageProcessor + - preprocess + +## Pix2StructTextModel + +[[autodoc]] Pix2StructTextModel + - forward + +## Pix2StructVisionModel + +[[autodoc]] Pix2StructVisionModel + - forward + +## Pix2StructForConditionalGeneration + +[[autodoc]] Pix2StructForConditionalGeneration + - forward diff --git a/docs/source/en/model_doc/plbart.md b/docs/source/en/model_doc/plbart.md new file mode 100644 index 000000000000..c9f502021485 --- /dev/null +++ b/docs/source/en/model_doc/plbart.md @@ -0,0 +1,119 @@ + + +# PLBart + +**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign +[@gchhablani](https://www.github.com/gchhablani). + +## Overview of PLBart + +The PLBART model was proposed in [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. +This is a BART-like model which can be used to perform code-summarization, code-generation, and code-translation tasks. The pre-trained model `plbart-base` has been trained using multilingual denoising task +on Java, Python and English. 
+
+According to the abstract:
+
+*Code summarization and generation empower conversion between programming language (PL) and natural language (NL), while code translation avails the migration of legacy code from one PL to another. This paper introduces PLBART, a sequence-to-sequence model capable of performing a broad spectrum of program and language understanding and generation tasks. PLBART is pre-trained on an extensive collection of Java and Python functions and associated NL text via denoising autoencoding. Experiments on code summarization in the English language, code generation, and code translation in seven programming languages show that PLBART outperforms or rivals state-of-the-art models. Moreover, experiments on discriminative tasks, e.g., program repair, clone detection, and vulnerable code detection, demonstrate PLBART's effectiveness in program understanding. Furthermore, analysis reveals that PLBART learns program syntax, style (e.g., identifier naming convention), logical flow (e.g., if block inside an else block is equivalent to else if block) that are crucial to program semantics and thus excels even with limited annotations.*
+
+This model was contributed by [gchhablani](https://huggingface.co/gchhablani). The Authors' code can be found [here](https://github.com/wasiahmad/PLBART).
+
+### Training of PLBart
+
+PLBart is a multilingual encoder-decoder (sequence-to-sequence) model primarily intended for code-to-text, text-to-code, and code-to-code tasks. As the model is multilingual, it expects the sequences in a different format. A special language id token is added in both the source and target text. The source text format is `X [eos, src_lang_code]` where `X` is the source text. The target text format is `[tgt_lang_code] X [eos]`. `bos` is never used.
+
+However, in some fine-tuning cases where a single language is used, no language token is provided. Please refer to [the paper](https://arxiv.org/abs/2103.06333) to learn more about this.
+
+In cases where the language code is needed, the regular [`~PLBartTokenizer.__call__`] will encode the source text format when you pass texts as the first argument or with the keyword argument `text`, and will encode the target text format if it's passed with the `text_target` keyword argument.
+
+- Supervised training
+
+```python
+>>> from transformers import PLBartForConditionalGeneration, PLBartTokenizer
+
+>>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-base", src_lang="en_XX", tgt_lang="python")
+>>> model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-base")
+>>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])"
+>>> expected_translation_english = "Returns the maximum value of a b c."
+>>> inputs = tokenizer(example_python_phrase, text_target=expected_translation_english, return_tensors="pt")
+>>> model(**inputs)  # forward pass; with `text_target` set, the labels are included and a loss is returned
+```
+
+- Generation
+
+  While generating the target text, set the `decoder_start_token_id` to the target language id. The following
+  example shows how to translate Python to English using the `uclanlp/plbart-python-en_XX` model.
+ +```python +>>> from transformers import PLBartForConditionalGeneration, PLBartTokenizer + +>>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="en_XX") +>>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])" +>>> inputs = tokenizer(example_python_phrase, return_tensors="pt") +>>> model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-python-en_XX") +>>> translated_tokens = model.generate(**inputs, decoder_start_token_id=tokenizer.lang_code_to_id["en_XX"]) +>>> tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0] +"Returns the maximum value of a b c." +``` + +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Causal language modeling task guide](../tasks/language_modeling) +- [Translation task guide](../tasks/translation) +- [Summarization task guide](../tasks/summarization) + +## PLBartConfig + +[[autodoc]] PLBartConfig + +## PLBartTokenizer + +[[autodoc]] PLBartTokenizer + - build_inputs_with_special_tokens + +## PLBartModel + +[[autodoc]] PLBartModel + - forward + +## PLBartForConditionalGeneration + +[[autodoc]] PLBartForConditionalGeneration + - forward + +## PLBartForSequenceClassification + +[[autodoc]] PLBartForSequenceClassification + - forward + +## PLBartForCausalLM + +[[autodoc]] PLBartForCausalLM + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/plbart.mdx b/docs/source/en/model_doc/plbart.mdx deleted file mode 100644 index 0755bb9a56e1..000000000000 --- a/docs/source/en/model_doc/plbart.mdx +++ /dev/null @@ -1,108 +0,0 @@ - - -# PLBart - -**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign -[@gchhablani](https://www.github.com/gchhablani). - -## Overview of PLBart - -The PLBART model was proposed in [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. -This is a BART-like model which can be used to perform code-summarization, code-generation, and code-translation tasks. The pre-trained model `plbart-base` has been trained using multilingual denoising task -on Java, Python and English. - -According to the abstract - -*Code summarization and generation empower conversion between programming language (PL) and natural language (NL), -while code translation avails the migration of legacy code from one PL to another. This paper introduces PLBART, -a sequence-to-sequence model capable of performing a broad spectrum of program and language understanding and generation tasks. -PLBART is pre-trained on an extensive collection of Java and Python functions and associated NL text via denoising autoencoding. -Experiments on code summarization in the English language, code generation, and code translation in seven programming languages -show that PLBART outperforms or rivals state-of-the-art models. Moreover, experiments on discriminative tasks, e.g., program -repair, clone detection, and vulnerable code detection, demonstrate PLBART's effectiveness in program understanding. 
-Furthermore, analysis reveals that PLBART learns program syntax, style (e.g., identifier naming convention), logical flow -(e.g., if block inside an else block is equivalent to else if block) that are crucial to program semantics and thus excels -even with limited annotations.* - -This model was contributed by [gchhablani](https://huggingface.co/gchhablani). The Authors' code can be found [here](https://github.com/wasiahmad/PLBART). - -### Training of PLBart - -PLBart is a multilingual encoder-decoder (sequence-to-sequence) model primarily intended for code-to-text, text-to-code, code-to-code tasks. As the -model is multilingual it expects the sequences in a different format. A special language id token is added in both the -source and target text. The source text format is `X [eos, src_lang_code]` where `X` is the source text. The -target text format is `[tgt_lang_code] X [eos]`. `bos` is never used. - -However, for fine-tuning, in some cases no language token is provided in cases where a single language is used. Please refer to [the paper](https://arxiv.org/abs/2103.06333) to learn more about this. - -In cases where the language code is needed, the regular [`~PLBartTokenizer.__call__`] will encode source text format -when you pass texts as the first argument or with the keyword argument `text`, and will encode target text format if -it's passed with the `text_target` keyword argument. - -- Supervised training - -```python ->>> from transformers import PLBartForConditionalGeneration, PLBartTokenizer - ->>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-base", src_lang="en_XX", tgt_lang="python") ->>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])" ->>> expected_translation_english = "Returns the maximum value of a b c." ->>> inputs = tokenizer(example_python_phrase, text_target=expected_translation_english, return_tensors="pt") ->>> model(**inputs) -``` - -- Generation - - While generating the target text set the `decoder_start_token_id` to the target language id. The following - example shows how to translate Python to English using the `uclanlp/plbart-python-en_XX` model. - -```python ->>> from transformers import PLBartForConditionalGeneration, PLBartTokenizer - ->>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="en_XX") ->>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])" ->>> inputs = tokenizer(example_python_phrase, return_tensors="pt") ->>> model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-python-en_XX") ->>> translated_tokens = model.generate(**inputs, decoder_start_token_id=tokenizer.lang_code_to_id["en_XX"]) ->>> tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0] -"Returns the maximum value of a b c." 
-``` - -## PLBartConfig - -[[autodoc]] PLBartConfig - -## PLBartTokenizer - -[[autodoc]] PLBartTokenizer - - build_inputs_with_special_tokens - -## PLBartModel - -[[autodoc]] PLBartModel - - forward - -## PLBartForConditionalGeneration - -[[autodoc]] PLBartForConditionalGeneration - - forward - -## PLBartForSequenceClassification - -[[autodoc]] PLBartForSequenceClassification - - forward - -## PLBartForCausalLM - -[[autodoc]] PLBartForCausalLM - - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/poolformer.md b/docs/source/en/model_doc/poolformer.md new file mode 100644 index 000000000000..537c60bdbcf6 --- /dev/null +++ b/docs/source/en/model_doc/poolformer.md @@ -0,0 +1,81 @@ + + +# PoolFormer + +## Overview + +The PoolFormer model was proposed in [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Sea AI Labs. Instead of designing complicated token mixer to achieve SOTA performance, the target of this work is to demonstrate the competence of transformer models largely stem from the general architecture MetaFormer. + +The abstract from the paper is the following: + +*Transformers have shown great potential in computer vision tasks. A common belief is their attention-based token mixer module contributes most to their competence. However, recent works show the attention-based module in transformers can be replaced by spatial MLPs and the resulted models still perform quite well. Based on this observation, we hypothesize that the general architecture of the transformers, instead of the specific token mixer module, is more essential to the model's performance. To verify this, we deliberately replace the attention module in transformers with an embarrassingly simple spatial pooling operator to conduct only the most basic token mixing. Surprisingly, we observe that the derived model, termed as PoolFormer, achieves competitive performance on multiple computer vision tasks. For example, on ImageNet-1K, PoolFormer achieves 82.1% top-1 accuracy, surpassing well-tuned vision transformer/MLP-like baselines DeiT-B/ResMLP-B24 by 0.3%/1.1% accuracy with 35%/52% fewer parameters and 48%/60% fewer MACs. The effectiveness of PoolFormer verifies our hypothesis and urges us to initiate the concept of "MetaFormer", a general architecture abstracted from transformers without specifying the token mixer. Based on the extensive experiments, we argue that MetaFormer is the key player in achieving superior results for recent transformer and MLP-like models on vision tasks. This work calls for more future research dedicated to improving MetaFormer instead of focusing on the token mixer modules. Additionally, our proposed PoolFormer could serve as a starting baseline for future MetaFormer architecture design.* + +The figure below illustrates the architecture of PoolFormer. Taken from the [original paper](https://arxiv.org/abs/2111.11418). + + + + +Tips: + +- PoolFormer has a hierarchical architecture, where instead of Attention, a simple Average Pooling layer is present. All checkpoints of the model can be found on the [hub](https://huggingface.co/models?other=poolformer). +- One can use [`PoolFormerImageProcessor`] to prepare images for the model. +- As most models, PoolFormer comes in different sizes, the details of which can be found in the table below. 
+ +| **Model variant** | **Depths** | **Hidden sizes** | **Params (M)** | **ImageNet-1k Top 1** | +| :---------------: | ------------- | ------------------- | :------------: | :-------------------: | +| s12 | [2, 2, 6, 2] | [64, 128, 320, 512] | 12 | 77.2 | +| s24 | [4, 4, 12, 4] | [64, 128, 320, 512] | 21 | 80.3 | +| s36 | [6, 6, 18, 6] | [64, 128, 320, 512] | 31 | 81.4 | +| m36 | [6, 6, 18, 6] | [96, 192, 384, 768] | 56 | 82.1 | +| m48 | [8, 8, 24, 8] | [96, 192, 384, 768] | 73 | 82.5 | + +This model was contributed by [heytanay](https://huggingface.co/heytanay). The original code can be found [here](https://github.com/sail-sg/poolformer). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with PoolFormer. + + + +- [`PoolFormerForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). +- See also: [Image classification task guide](../tasks/image_classification) + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## PoolFormerConfig + +[[autodoc]] PoolFormerConfig + +## PoolFormerFeatureExtractor + +[[autodoc]] PoolFormerFeatureExtractor + - __call__ + +## PoolFormerImageProcessor + +[[autodoc]] PoolFormerImageProcessor + - preprocess + +## PoolFormerModel + +[[autodoc]] PoolFormerModel + - forward + +## PoolFormerForImageClassification + +[[autodoc]] PoolFormerForImageClassification + - forward diff --git a/docs/source/en/model_doc/poolformer.mdx b/docs/source/en/model_doc/poolformer.mdx deleted file mode 100644 index e04762626163..000000000000 --- a/docs/source/en/model_doc/poolformer.mdx +++ /dev/null @@ -1,66 +0,0 @@ - - -# PoolFormer - -## Overview - -The PoolFormer model was proposed in [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Sea AI Labs. Instead of designing complicated token mixer to achieve SOTA performance, the target of this work is to demonstrate the competence of transformer models largely stem from the general architecture MetaFormer. - -The abstract from the paper is the following: - -*Transformers have shown great potential in computer vision tasks. A common belief is their attention-based token mixer module contributes most to their competence. However, recent works show the attention-based module in transformers can be replaced by spatial MLPs and the resulted models still perform quite well. Based on this observation, we hypothesize that the general architecture of the transformers, instead of the specific token mixer module, is more essential to the model's performance. To verify this, we deliberately replace the attention module in transformers with an embarrassingly simple spatial pooling operator to conduct only the most basic token mixing. Surprisingly, we observe that the derived model, termed as PoolFormer, achieves competitive performance on multiple computer vision tasks. For example, on ImageNet-1K, PoolFormer achieves 82.1% top-1 accuracy, surpassing well-tuned vision transformer/MLP-like baselines DeiT-B/ResMLP-B24 by 0.3%/1.1% accuracy with 35%/52% fewer parameters and 48%/60% fewer MACs. 
The effectiveness of PoolFormer verifies our hypothesis and urges us to initiate the concept of "MetaFormer", a general architecture abstracted from transformers without specifying the token mixer. Based on the extensive experiments, we argue that MetaFormer is the key player in achieving superior results for recent transformer and MLP-like models on vision tasks. This work calls for more future research dedicated to improving MetaFormer instead of focusing on the token mixer modules. Additionally, our proposed PoolFormer could serve as a starting baseline for future MetaFormer architecture design.* - -The figure below illustrates the architecture of PoolFormer. Taken from the [original paper](https://arxiv.org/abs/2111.11418). - - - - -Tips: - -- PoolFormer has a hierarchical architecture, where instead of Attention, a simple Average Pooling layer is present. All checkpoints of the model can be found on the [hub](https://huggingface.co/models?other=poolformer). -- One can use [`PoolFormerImageProcessor`] to prepare images for the model. -- As most models, PoolFormer comes in different sizes, the details of which can be found in the table below. - -| **Model variant** | **Depths** | **Hidden sizes** | **Params (M)** | **ImageNet-1k Top 1** | -| :---------------: | ------------- | ------------------- | :------------: | :-------------------: | -| s12 | [2, 2, 6, 2] | [64, 128, 320, 512] | 12 | 77.2 | -| s24 | [4, 4, 12, 4] | [64, 128, 320, 512] | 21 | 80.3 | -| s36 | [6, 6, 18, 6] | [64, 128, 320, 512] | 31 | 81.4 | -| m36 | [6, 6, 18, 6] | [96, 192, 384, 768] | 56 | 82.1 | -| m48 | [8, 8, 24, 8] | [96, 192, 384, 768] | 73 | 82.5 | - -This model was contributed by [heytanay](https://huggingface.co/heytanay). The original code can be found [here](https://github.com/sail-sg/poolformer). - -## PoolFormerConfig - -[[autodoc]] PoolFormerConfig - -## PoolFormerFeatureExtractor - -[[autodoc]] PoolFormerFeatureExtractor - - __call__ - -## PoolFormerImageProcessor - -[[autodoc]] PoolFormerImageProcessor - - preprocess - -## PoolFormerModel - -[[autodoc]] PoolFormerModel - - forward - -## PoolFormerForImageClassification - -[[autodoc]] PoolFormerForImageClassification - - forward diff --git a/docs/source/en/model_doc/pop2piano.md b/docs/source/en/model_doc/pop2piano.md new file mode 100644 index 000000000000..95fd83f19237 --- /dev/null +++ b/docs/source/en/model_doc/pop2piano.md @@ -0,0 +1,196 @@ + + +# Pop2Piano + +
+ +## Overview + +The Pop2Piano model was proposed in [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee. + +Piano covers of pop music are widely enjoyed, but generating them from music is not a trivial task. It requires great +expertise with playing piano as well as knowing different characteristics and melodies of a song. With Pop2Piano you +can directly generate a cover from a song's audio waveform. It is the first model to directly generate a piano cover +from pop audio without melody and chord extraction modules. + +Pop2Piano is an encoder-decoder Transformer model based on [T5](https://arxiv.org/pdf/1910.10683.pdf). The input audio +is transformed to its waveform and passed to the encoder, which transforms it to a latent representation. The decoder +uses these latent representations to generate token ids in an autoregressive way. Each token id corresponds to one of four +different token types: time, velocity, note and 'special'. The token ids are then decoded to their equivalent MIDI file. + + +The abstract from the paper is the following: + +*Piano covers of pop music are enjoyed by many people. However, the +task of automatically generating piano covers of pop music is still +understudied. This is partly due to the lack of synchronized +{Pop, Piano Cover} data pairs, which made it challenging to apply +the latest data-intensive deep learning-based methods. To leverage +the power of the data-driven approach, we make a large amount of +paired and synchronized {Pop, Piano Cover} data using an automated +pipeline. In this paper, we present Pop2Piano, a Transformer network +that generates piano covers given waveforms of pop music. To the best +of our knowledge, this is the first model to generate a piano cover +directly from pop audio without using melody and chord extraction +modules. We show that Pop2Piano, trained with our dataset, is capable +of producing plausible piano covers.* + + +Tips: + +1. To use Pop2Piano, you will need to install the 🤗 Transformers library, as well as the following third party modules: +``` +pip install pretty-midi==0.2.9 essentia==2.1b6.dev1034 librosa scipy +``` +Please note that you may need to restart your runtime after installation. +2. Pop2Piano is an Encoder-Decoder based model like T5. +3. Pop2Piano can be used to generate midi-audio files for a given audio sequence. +4. Choosing different composers in `Pop2PianoForConditionalGeneration.generate()` can lead to variety of different results. +5. Setting the sampling rate to 44.1 kHz when loading the audio file can give good performance. +6. Though Pop2Piano was mainly trained on Korean Pop music, it also does pretty well on other Western Pop or Hip Hop songs. + +This model was contributed by [Susnato Dhar](https://huggingface.co/susnato). +The original code can be found [here](https://github.com/sweetcocoa/pop2piano). + +## Examples + +- Example using HuggingFace Dataset: + +```python +>>> from datasets import load_dataset +>>> from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor + +>>> model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") +>>> processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano") +>>> ds = load_dataset("sweetcocoa/pop2piano_ci", split="test") + +>>> inputs = processor( +... audio=ds["audio"][0]["array"], sampling_rate=ds["audio"][0]["sampling_rate"], return_tensors="pt" +... 
) +>>> model_output = model.generate(input_features=inputs["input_features"], composer="composer1") +>>> tokenizer_output = processor.batch_decode( +... token_ids=model_output, feature_extractor_output=inputs +... )["pretty_midi_objects"][0] +>>> tokenizer_output.write("./Outputs/midi_output.mid") +``` + +- Example using your own audio file: + +```python +>>> import librosa +>>> from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor + +>>> audio, sr = librosa.load("", sr=44100) # feel free to change the sr to a suitable value. +>>> model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") +>>> processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano") + +>>> inputs = processor(audio=audio, sampling_rate=sr, return_tensors="pt") +>>> model_output = model.generate(input_features=inputs["input_features"], composer="composer1") +>>> tokenizer_output = processor.batch_decode( +... token_ids=model_output, feature_extractor_output=inputs +... )["pretty_midi_objects"][0] +>>> tokenizer_output.write("./Outputs/midi_output.mid") +``` + +- Example of processing multiple audio files in batch: + +```python +>>> import librosa +>>> from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor + +>>> # feel free to change the sr to a suitable value. +>>> audio1, sr1 = librosa.load("", sr=44100) +>>> audio2, sr2 = librosa.load("", sr=44100) +>>> model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") +>>> processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano") + +>>> inputs = processor(audio=[audio1, audio2], sampling_rate=[sr1, sr2], return_attention_mask=True, return_tensors="pt") +>>> # Since we now generating in batch(2 audios) we must pass the attention_mask +>>> model_output = model.generate( +... input_features=inputs["input_features"], +... attention_mask=inputs["attention_mask"], +... composer="composer1", +... ) +>>> tokenizer_output = processor.batch_decode( +... token_ids=model_output, feature_extractor_output=inputs +... )["pretty_midi_objects"] + +>>> # Since we now have 2 generated MIDI files +>>> tokenizer_output[0].write("./Outputs/midi_output1.mid") +>>> tokenizer_output[1].write("./Outputs/midi_output2.mid") +``` + + +- Example of processing multiple audio files in batch (Using `Pop2PianoFeatureExtractor` and `Pop2PianoTokenizer`): + +```python +>>> import librosa +>>> from transformers import Pop2PianoForConditionalGeneration, Pop2PianoFeatureExtractor, Pop2PianoTokenizer + +>>> # feel free to change the sr to a suitable value. +>>> audio1, sr1 = librosa.load("", sr=44100) +>>> audio2, sr2 = librosa.load("", sr=44100) +>>> model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") +>>> feature_extractor = Pop2PianoFeatureExtractor.from_pretrained("sweetcocoa/pop2piano") +>>> tokenizer = Pop2PianoTokenizer.from_pretrained("sweetcocoa/pop2piano") + +>>> inputs = feature_extractor( +... audio=[audio1, audio2], +... sampling_rate=[sr1, sr2], +... return_attention_mask=True, +... return_tensors="pt", +... ) +>>> # Since we now generating in batch(2 audios) we must pass the attention_mask +>>> model_output = model.generate( +... input_features=inputs["input_features"], +... attention_mask=inputs["attention_mask"], +... composer="composer1", +... ) +>>> tokenizer_output = tokenizer.batch_decode( +... token_ids=model_output, feature_extractor_output=inputs +... 
)["pretty_midi_objects"] + +>>> # Since we now have 2 generated MIDI files +>>> tokenizer_output[0].write("./Outputs/midi_output1.mid") +>>> tokenizer_output[1].write("./Outputs/midi_output2.mid") +``` + + +## Pop2PianoConfig + +[[autodoc]] Pop2PianoConfig + +## Pop2PianoFeatureExtractor + +[[autodoc]] Pop2PianoFeatureExtractor + - __call__ + +## Pop2PianoForConditionalGeneration + +[[autodoc]] Pop2PianoForConditionalGeneration + - forward + - generate + +## Pop2PianoTokenizer + +[[autodoc]] Pop2PianoTokenizer + - __call__ + +## Pop2PianoProcessor + +[[autodoc]] Pop2PianoProcessor + - __call__ diff --git a/docs/source/en/model_doc/prophetnet.md b/docs/source/en/model_doc/prophetnet.md new file mode 100644 index 000000000000..6ab0937da77e --- /dev/null +++ b/docs/source/en/model_doc/prophetnet.md @@ -0,0 +1,107 @@ + + +# ProphetNet + +
+
+
+**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign
+@patrickvonplaten
+
+## Overview
+
+The ProphetNet model was proposed in [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang, Ming Zhou on 13 Jan, 2020.
+
+ProphetNet is an encoder-decoder model and can predict n future tokens for "ngram" language modeling instead of just the next token.
+
+The abstract from the paper is the following:
+
+*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time step. The future n-gram prediction explicitly encourages the model to plan for the future tokens and prevent overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.*
+
+Tips:
+
+- ProphetNet is a model with absolute position embeddings, so it's usually advised to pad the inputs on the right rather than
+  the left.
+- The model architecture is based on the original Transformer, but replaces the “standard” self-attention mechanism in the decoder with a main self-attention mechanism and a self and n-stream (predict) self-attention mechanism.
+
+The Authors' code can be found [here](https://github.com/microsoft/ProphetNet).
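As a rough sketch of how the classes documented below fit together for abstractive summarization (the `microsoft/prophetnet-large-uncased-cnndm` checkpoint name is an assumption here; any ProphetNet sequence-to-sequence checkpoint can be substituted):

```python
>>> from transformers import ProphetNetForConditionalGeneration, ProphetNetTokenizer

>>> tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased-cnndm")
>>> model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased-cnndm")

>>> article = "The US Food and Drug Administration approved the first treatment for ..."
>>> inputs = tokenizer(article, return_tensors="pt")

>>> # Generation runs standard beam search; the future n-gram prediction heads are a (pre-)training objective only.
>>> summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=100, early_stopping=True)
>>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0])
```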
+ +## Documentation resources + +- [Causal language modeling task guide](../tasks/language_modeling) +- [Translation task guide](../tasks/translation) +- [Summarization task guide](../tasks/summarization) + +## ProphetNetConfig + +[[autodoc]] ProphetNetConfig + +## ProphetNetTokenizer + +[[autodoc]] ProphetNetTokenizer + +## ProphetNet specific outputs + +[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetSeq2SeqLMOutput + +[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetSeq2SeqModelOutput + +[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetDecoderModelOutput + +[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetDecoderLMOutput + +## ProphetNetModel + +[[autodoc]] ProphetNetModel + - forward + +## ProphetNetEncoder + +[[autodoc]] ProphetNetEncoder + - forward + +## ProphetNetDecoder + +[[autodoc]] ProphetNetDecoder + - forward + +## ProphetNetForConditionalGeneration + +[[autodoc]] ProphetNetForConditionalGeneration + - forward + +## ProphetNetForCausalLM + +[[autodoc]] ProphetNetForCausalLM + - forward diff --git a/docs/source/en/model_doc/prophetnet.mdx b/docs/source/en/model_doc/prophetnet.mdx deleted file mode 100644 index 14d0b3a92415..000000000000 --- a/docs/source/en/model_doc/prophetnet.mdx +++ /dev/null @@ -1,87 +0,0 @@ - - -# ProphetNet - -**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign -@patrickvonplaten - -## Overview - -The ProphetNet model was proposed in [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training,](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei -Zhang, Ming Zhou on 13 Jan, 2020. - -ProphetNet is an encoder-decoder model and can predict n-future tokens for "ngram" language modeling instead of just -the next token. - -The abstract from the paper is the following: - -*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel -self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of -the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by -n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time -step. The future n-gram prediction explicitly encourages the model to plan for the future tokens and prevent -overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale -dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for -abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new -state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.* - -Tips: - -- ProphetNet is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than - the left. - -The Authors' code can be found [here](https://github.com/microsoft/ProphetNet). 
- - -## ProphetNetConfig - -[[autodoc]] ProphetNetConfig - -## ProphetNetTokenizer - -[[autodoc]] ProphetNetTokenizer - -## ProphetNet specific outputs - -[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetSeq2SeqLMOutput - -[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetSeq2SeqModelOutput - -[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetDecoderModelOutput - -[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetDecoderLMOutput - -## ProphetNetModel - -[[autodoc]] ProphetNetModel - - forward - -## ProphetNetEncoder - -[[autodoc]] ProphetNetEncoder - - forward - -## ProphetNetDecoder - -[[autodoc]] ProphetNetDecoder - - forward - -## ProphetNetForConditionalGeneration - -[[autodoc]] ProphetNetForConditionalGeneration - - forward - -## ProphetNetForCausalLM - -[[autodoc]] ProphetNetForCausalLM - - forward diff --git a/docs/source/en/model_doc/pvt.md b/docs/source/en/model_doc/pvt.md new file mode 100644 index 000000000000..4b297a33f8fc --- /dev/null +++ b/docs/source/en/model_doc/pvt.md @@ -0,0 +1,71 @@ + + +# Pyramid Vision Transformer (PVT) + +## Overview + +The PVT model was proposed in +[Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/abs/2102.12122) +by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. The PVT is a type of +vision transformer that utilizes a pyramid structure to make it an effective backbone for dense prediction tasks. Specifically +it allows for more fine-grained inputs (4 x 4 pixels per patch) to be used, while simultaneously shrinking the sequence length +of the Transformer as it deepens - reducing the computational cost. Additionally, a spatial-reduction attention (SRA) layer +is used to further reduce the resource consumption when learning high-resolution features. + +The abstract from the paper is the following: + +*Although convolutional neural networks (CNNs) have achieved great success in computer vision, this work investigates a +simpler, convolution-free backbone network useful for many dense prediction tasks. Unlike the recently proposed Vision +Transformer (ViT) that was designed for image classification specifically, we introduce the Pyramid Vision Transformer +(PVT), which overcomes the difficulties of porting Transformer to various dense prediction tasks. PVT has several +merits compared to current state of the arts. Different from ViT that typically yields low resolution outputs and +incurs high computational and memory costs, PVT not only can be trained on dense partitions of an image to achieve high +output resolution, which is important for dense prediction, but also uses a progressive shrinking pyramid to reduce the +computations of large feature maps. PVT inherits the advantages of both CNN and Transformer, making it a unified +backbone for various vision tasks without convolutions, where it can be used as a direct replacement for CNN backbones. +We validate PVT through extensive experiments, showing that it boosts the performance of many downstream tasks, including +object detection, instance and semantic segmentation. For example, with a comparable number of parameters, PVT+RetinaNet +achieves 40.4 AP on the COCO dataset, surpassing ResNet50+RetinNet (36.3 AP) by 4.1 absolute AP (see Figure 2). 
We hope that PVT could serve as an alternative and useful backbone for pixel-level predictions and facilitate future research.*
+
+This model was contributed by [Xrenya](
+
+# QDQBERT
+
+## Overview
+
+The QDQBERT model can be referenced in [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
+
+The abstract from the paper is the following:
+
+*Quantization techniques can reduce the size of Deep Neural Networks and improve inference latency and throughput by taking advantage of high throughput integer instructions. In this paper we review the mathematical aspects of quantization parameters and evaluate their choices on a wide range of neural network models for different application domains, including vision, speech, and language. We focus on quantization techniques that are amenable to acceleration by processors with high-throughput integer math pipelines. We also present a workflow for 8-bit quantization that is able to maintain accuracy within 1% of the floating-point baseline on all networks studied, including models that are more difficult to quantize, such as MobileNets and BERT-large.*
+
+Tips:
+
+- The QDQBERT model adds fake quantization operations (pairs of QuantizeLinear/DequantizeLinear ops) to (i) linear layer
+  inputs and weights, (ii) matmul inputs, and (iii) residual add inputs in the BERT model.
+
+- QDQBERT requires the [Pytorch Quantization Toolkit](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization). To install it: `pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com`
+
+- The QDQBERT model can be loaded from any checkpoint of a HuggingFace BERT model (for example *bert-base-uncased*) to
+  perform Quantization Aware Training/Post Training Quantization.
+
+- A complete example of using the QDQBERT model to perform Quantization Aware Training and Post Training Quantization for
+  the SQuAD task can be found at [transformers/examples/research_projects/quantization-qdqbert/](examples/research_projects/quantization-qdqbert/).
+
+This model was contributed by [shangz](https://huggingface.co/shangz).
+
+
+### Set default quantizers
+
+The QDQBERT model adds fake quantization operations (pairs of QuantizeLinear/DequantizeLinear ops) to BERT via `TensorQuantizer` from the [Pytorch Quantization Toolkit](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization). `TensorQuantizer` is the module for quantizing tensors, with `QuantDescriptor` defining how the tensor should be quantized. Refer to the [Pytorch Quantization Toolkit userguide](https://docs.nvidia.com/deeplearning/tensorrt/pytorch-quantization-toolkit/docs/userguide.html) for more details.
+
+Before creating the QDQBERT model, one has to set the default `QuantDescriptor` defining the default tensor quantizers.
+ +Example: + +```python +>>> import pytorch_quantization.nn as quant_nn +>>> from pytorch_quantization.tensor_quant import QuantDescriptor + +>>> # The default tensor quantizer is set to use Max calibration method +>>> input_desc = QuantDescriptor(num_bits=8, calib_method="max") +>>> # The default tensor quantizer is set to be per-channel quantization for weights +>>> weight_desc = QuantDescriptor(num_bits=8, axis=((0,))) +>>> quant_nn.QuantLinear.set_default_quant_desc_input(input_desc) +>>> quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc) +``` + +### Calibration + +Calibration is the terminology of passing data samples to the quantizer and deciding the best scaling factors for +tensors. After setting up the tensor quantizers, one can use the following example to calibrate the model: + +```python +>>> # Find the TensorQuantizer and enable calibration +>>> for name, module in model.named_modules(): +... if name.endswith("_input_quantizer"): +... module.enable_calib() +... module.disable_quant() # Use full precision data to calibrate + +>>> # Feeding data samples +>>> model(x) +>>> # ... + +>>> # Finalize calibration +>>> for name, module in model.named_modules(): +... if name.endswith("_input_quantizer"): +... module.load_calib_amax() +... module.enable_quant() + +>>> # If running on GPU, it needs to call .cuda() again because new tensors will be created by calibration process +>>> model.cuda() + +>>> # Keep running the quantized model +>>> # ... +``` + +### Export to ONNX + +The goal of exporting to ONNX is to deploy inference by [TensorRT](https://developer.nvidia.com/tensorrt). Fake +quantization will be broken into a pair of QuantizeLinear/DequantizeLinear ONNX ops. After setting static member of +TensorQuantizer to use Pytorch’s own fake quantization functions, fake quantized model can be exported to ONNX, follow +the instructions in [torch.onnx](https://pytorch.org/docs/stable/onnx.html). Example: + +```python +>>> from pytorch_quantization.nn import TensorQuantizer + +>>> TensorQuantizer.use_fb_fake_quant = True + +>>> # Load the calibrated model +>>> ... +>>> # ONNX export +>>> torch.onnx.export(...) 
+``` + +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Token classification task guide](../tasks/token_classification) +- [Question answering task guide](../tasks/question_answering) +- [Causal language modeling task guide](../tasks/language_modeling) +- [Masked language modeling task guide](../tasks/masked_language_modeling) +- [Multiple choice task guide](../tasks/multiple_choice) + +## QDQBertConfig + +[[autodoc]] QDQBertConfig + +## QDQBertModel + +[[autodoc]] QDQBertModel + - forward + +## QDQBertLMHeadModel + +[[autodoc]] QDQBertLMHeadModel + - forward + +## QDQBertForMaskedLM + +[[autodoc]] QDQBertForMaskedLM + - forward + +## QDQBertForSequenceClassification + +[[autodoc]] QDQBertForSequenceClassification + - forward + +## QDQBertForNextSentencePrediction + +[[autodoc]] QDQBertForNextSentencePrediction + - forward + +## QDQBertForMultipleChoice + +[[autodoc]] QDQBertForMultipleChoice + - forward + +## QDQBertForTokenClassification + +[[autodoc]] QDQBertForTokenClassification + - forward + +## QDQBertForQuestionAnswering + +[[autodoc]] QDQBertForQuestionAnswering + - forward diff --git a/docs/source/en/model_doc/qdqbert.mdx b/docs/source/en/model_doc/qdqbert.mdx deleted file mode 100644 index df7b7bcee625..000000000000 --- a/docs/source/en/model_doc/qdqbert.mdx +++ /dev/null @@ -1,159 +0,0 @@ - - -# QDQBERT - -## Overview - -The QDQBERT model can be referenced in [Integer Quantization for Deep Learning Inference: Principles and Empirical -Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius -Micikevicius. - -The abstract from the paper is the following: - -*Quantization techniques can reduce the size of Deep Neural Networks and improve inference latency and throughput by -taking advantage of high throughput integer instructions. In this paper we review the mathematical aspects of -quantization parameters and evaluate their choices on a wide range of neural network models for different application -domains, including vision, speech, and language. We focus on quantization techniques that are amenable to acceleration -by processors with high-throughput integer math pipelines. We also present a workflow for 8-bit quantization that is -able to maintain accuracy within 1% of the floating-point baseline on all networks studied, including models that are -more difficult to quantize, such as MobileNets and BERT-large.* - -Tips: - -- QDQBERT model adds fake quantization operations (pair of QuantizeLinear/DequantizeLinear ops) to (i) linear layer - inputs and weights, (ii) matmul inputs, (iii) residual add inputs, in BERT model. - -- QDQBERT requires the dependency of [Pytorch Quantization Toolkit](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization). To install `pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com` - -- QDQBERT model can be loaded from any checkpoint of HuggingFace BERT model (for example *bert-base-uncased*), and - perform Quantization Aware Training/Post Training Quantization. - -- A complete example of using QDQBERT model to perform Quatization Aware Training and Post Training Quantization for - SQUAD task can be found at [transformers/examples/research_projects/quantization-qdqbert/](examples/research_projects/quantization-qdqbert/). - -This model was contributed by [shangz](https://huggingface.co/shangz). 
- - -### Set default quantizers - -QDQBERT model adds fake quantization operations (pair of QuantizeLinear/DequantizeLinear ops) to BERT by -`TensorQuantizer` in [Pytorch Quantization Toolkit](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization). `TensorQuantizer` is the module -for quantizing tensors, with `QuantDescriptor` defining how the tensor should be quantized. Refer to [Pytorch -Quantization Toolkit userguide](https://docs.nvidia.com/deeplearning/tensorrt/pytorch-quantization-toolkit/docs/userguide.html) for more details. - -Before creating QDQBERT model, one has to set the default `QuantDescriptor` defining default tensor quantizers. - -Example: - -```python ->>> import pytorch_quantization.nn as quant_nn ->>> from pytorch_quantization.tensor_quant import QuantDescriptor - ->>> # The default tensor quantizer is set to use Max calibration method ->>> input_desc = QuantDescriptor(num_bits=8, calib_method="max") ->>> # The default tensor quantizer is set to be per-channel quantization for weights ->>> weight_desc = QuantDescriptor(num_bits=8, axis=((0,))) ->>> quant_nn.QuantLinear.set_default_quant_desc_input(input_desc) ->>> quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc) -``` - -### Calibration - -Calibration is the terminology of passing data samples to the quantizer and deciding the best scaling factors for -tensors. After setting up the tensor quantizers, one can use the following example to calibrate the model: - -```python ->>> # Find the TensorQuantizer and enable calibration ->>> for name, module in model.named_modules(): -... if name.endswith("_input_quantizer"): -... module.enable_calib() -... module.disable_quant() # Use full precision data to calibrate - ->>> # Feeding data samples ->>> model(x) ->>> # ... - ->>> # Finalize calibration ->>> for name, module in model.named_modules(): -... if name.endswith("_input_quantizer"): -... module.load_calib_amax() -... module.enable_quant() - ->>> # If running on GPU, it needs to call .cuda() again because new tensors will be created by calibration process ->>> model.cuda() - ->>> # Keep running the quantized model ->>> # ... -``` - -### Export to ONNX - -The goal of exporting to ONNX is to deploy inference by [TensorRT](https://developer.nvidia.com/tensorrt). Fake -quantization will be broken into a pair of QuantizeLinear/DequantizeLinear ONNX ops. After setting static member of -TensorQuantizer to use Pytorch’s own fake quantization functions, fake quantized model can be exported to ONNX, follow -the instructions in [torch.onnx](https://pytorch.org/docs/stable/onnx.html). Example: - -```python ->>> from pytorch_quantization.nn import TensorQuantizer - ->>> TensorQuantizer.use_fb_fake_quant = True - ->>> # Load the calibrated model ->>> ... ->>> # ONNX export ->>> torch.onnx.export(...) 
-``` - -## QDQBertConfig - -[[autodoc]] QDQBertConfig - -## QDQBertModel - -[[autodoc]] QDQBertModel - - forward - -## QDQBertLMHeadModel - -[[autodoc]] QDQBertLMHeadModel - - forward - -## QDQBertForMaskedLM - -[[autodoc]] QDQBertForMaskedLM - - forward - -## QDQBertForSequenceClassification - -[[autodoc]] QDQBertForSequenceClassification - - forward - -## QDQBertForNextSentencePrediction - -[[autodoc]] QDQBertForNextSentencePrediction - - forward - -## QDQBertForMultipleChoice - -[[autodoc]] QDQBertForMultipleChoice - - forward - -## QDQBertForTokenClassification - -[[autodoc]] QDQBertForTokenClassification - - forward - -## QDQBertForQuestionAnswering - -[[autodoc]] QDQBertForQuestionAnswering - - forward diff --git a/docs/source/en/model_doc/rag.md b/docs/source/en/model_doc/rag.md new file mode 100644 index 000000000000..b467c6169f66 --- /dev/null +++ b/docs/source/en/model_doc/rag.md @@ -0,0 +1,108 @@ + + +# RAG + +
+<!-- Hub badges: Models -->
+ +## Overview + +Retrieval-augmented generation ("RAG") models combine the powers of pretrained dense retrieval (DPR) and +sequence-to-sequence models. RAG models retrieve documents, pass them to a seq2seq model, then marginalize to generate +outputs. The retriever and seq2seq modules are initialized from pretrained models, and fine-tuned jointly, allowing +both retrieval and generation to adapt to downstream tasks. + +It is based on the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir +Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. + +The abstract from the paper is the following: + +*Large pre-trained language models have been shown to store factual knowledge in their parameters, and achieve +state-of-the-art results when fine-tuned on downstream NLP tasks. However, their ability to access and precisely +manipulate knowledge is still limited, and hence on knowledge-intensive tasks, their performance lags behind +task-specific architectures. Additionally, providing provenance for their decisions and updating their world knowledge +remain open research problems. Pre-trained models with a differentiable access mechanism to explicit nonparametric +memory can overcome this issue, but have so far been only investigated for extractive downstream tasks. We explore a +general-purpose fine-tuning recipe for retrieval-augmented generation (RAG) — models which combine pre-trained +parametric and non-parametric memory for language generation. We introduce RAG models where the parametric memory is a +pre-trained seq2seq model and the non-parametric memory is a dense vector index of Wikipedia, accessed with a +pre-trained neural retriever. We compare two RAG formulations, one which conditions on the same retrieved passages +across the whole generated sequence, the other can use different passages per token. We fine-tune and evaluate our +models on a wide range of knowledge-intensive NLP tasks and set the state-of-the-art on three open domain QA tasks, +outperforming parametric seq2seq models and task-specific retrieve-and-extract architectures. For language generation +tasks, we find that RAG models generate more specific, diverse and factual language than a state-of-the-art +parametric-only seq2seq baseline.* + +This model was contributed by [ola13](https://huggingface.co/ola13). + +Tips: +- Retrieval-augmented generation (“RAG”) models combine the powers of pretrained dense retrieval (DPR) and Seq2Seq models. RAG models retrieve docs, pass them to a seq2seq model, then marginalize to generate outputs. The retriever and seq2seq modules are initialized from pretrained models, and fine-tuned jointly, allowing both retrieval and generation to adapt to downstream tasks. 
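
As a quick end-to-end illustration, the snippet below wires the classes documented on this page together for generation. It is only a sketch: it assumes the `facebook/rag-sequence-nq` checkpoint and uses the small dummy retrieval index (`use_dummy_dataset=True`) instead of the full Wikipedia index, so the generations will not be representative.

```python
from transformers import RagRetriever, RagSequenceForGeneration, RagTokenizer

# Load the tokenizer, a (dummy) retrieval index, and the retrieval-augmented generator.
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True
)
model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)

# Encode a question, retrieve supporting documents, and generate an answer.
inputs = tokenizer("who holds the record in 100m freestyle", return_tensors="pt")
generated_ids = model.generate(input_ids=inputs["input_ids"])
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
```
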
+ +## RagConfig + +[[autodoc]] RagConfig + +## RagTokenizer + +[[autodoc]] RagTokenizer + +## Rag specific outputs + +[[autodoc]] models.rag.modeling_rag.RetrievAugLMMarginOutput + +[[autodoc]] models.rag.modeling_rag.RetrievAugLMOutput + +## RagRetriever + +[[autodoc]] RagRetriever + +## RagModel + +[[autodoc]] RagModel + - forward + +## RagSequenceForGeneration + +[[autodoc]] RagSequenceForGeneration + - forward + - generate + +## RagTokenForGeneration + +[[autodoc]] RagTokenForGeneration + - forward + - generate + +## TFRagModel + +[[autodoc]] TFRagModel + - call + +## TFRagSequenceForGeneration + +[[autodoc]] TFRagSequenceForGeneration + - call + - generate + +## TFRagTokenForGeneration + +[[autodoc]] TFRagTokenForGeneration + - call + - generate diff --git a/docs/source/en/model_doc/rag.mdx b/docs/source/en/model_doc/rag.mdx deleted file mode 100644 index 2f5d3498d8cf..000000000000 --- a/docs/source/en/model_doc/rag.mdx +++ /dev/null @@ -1,96 +0,0 @@ - - -# RAG - -## Overview - -Retrieval-augmented generation ("RAG") models combine the powers of pretrained dense retrieval (DPR) and -sequence-to-sequence models. RAG models retrieve documents, pass them to a seq2seq model, then marginalize to generate -outputs. The retriever and seq2seq modules are initialized from pretrained models, and fine-tuned jointly, allowing -both retrieval and generation to adapt to downstream tasks. - -It is based on the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir -Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. - -The abstract from the paper is the following: - -*Large pre-trained language models have been shown to store factual knowledge in their parameters, and achieve -state-of-the-art results when fine-tuned on downstream NLP tasks. However, their ability to access and precisely -manipulate knowledge is still limited, and hence on knowledge-intensive tasks, their performance lags behind -task-specific architectures. Additionally, providing provenance for their decisions and updating their world knowledge -remain open research problems. Pre-trained models with a differentiable access mechanism to explicit nonparametric -memory can overcome this issue, but have so far been only investigated for extractive downstream tasks. We explore a -general-purpose fine-tuning recipe for retrieval-augmented generation (RAG) — models which combine pre-trained -parametric and non-parametric memory for language generation. We introduce RAG models where the parametric memory is a -pre-trained seq2seq model and the non-parametric memory is a dense vector index of Wikipedia, accessed with a -pre-trained neural retriever. We compare two RAG formulations, one which conditions on the same retrieved passages -across the whole generated sequence, the other can use different passages per token. We fine-tune and evaluate our -models on a wide range of knowledge-intensive NLP tasks and set the state-of-the-art on three open domain QA tasks, -outperforming parametric seq2seq models and task-specific retrieve-and-extract architectures. For language generation -tasks, we find that RAG models generate more specific, diverse and factual language than a state-of-the-art -parametric-only seq2seq baseline.* - -This model was contributed by [ola13](https://huggingface.co/ola13). 
- - -## RagConfig - -[[autodoc]] RagConfig - -## RagTokenizer - -[[autodoc]] RagTokenizer - -## Rag specific outputs - -[[autodoc]] models.rag.modeling_rag.RetrievAugLMMarginOutput - -[[autodoc]] models.rag.modeling_rag.RetrievAugLMOutput - -## RagRetriever - -[[autodoc]] RagRetriever - -## RagModel - -[[autodoc]] RagModel - - forward - -## RagSequenceForGeneration - -[[autodoc]] RagSequenceForGeneration - - forward - - generate - -## RagTokenForGeneration - -[[autodoc]] RagTokenForGeneration - - forward - - generate - -## TFRagModel - -[[autodoc]] TFRagModel - - call - -## TFRagSequenceForGeneration - -[[autodoc]] TFRagSequenceForGeneration - - call - - generate - -## TFRagTokenForGeneration - -[[autodoc]] TFRagTokenForGeneration - - call - - generate diff --git a/docs/source/en/model_doc/realm.md b/docs/source/en/model_doc/realm.md new file mode 100644 index 000000000000..a8227bc83c73 --- /dev/null +++ b/docs/source/en/model_doc/realm.md @@ -0,0 +1,89 @@ + + +# REALM + +## Overview + +The REALM model was proposed in [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. It's a +retrieval-augmented language model that firstly retrieves documents from a textual knowledge corpus and then +utilizes retrieved documents to process question answering tasks. + +The abstract from the paper is the following: + +*Language model pre-training has been shown to capture a surprising amount of world knowledge, crucial for NLP tasks +such as question answering. However, this knowledge is stored implicitly in the parameters of a neural network, +requiring ever-larger networks to cover more facts. To capture knowledge in a more modular and interpretable way, we +augment language model pre-training with a latent knowledge retriever, which allows the model to retrieve and attend +over documents from a large corpus such as Wikipedia, used during pre-training, fine-tuning and inference. For the +first time, we show how to pre-train such a knowledge retriever in an unsupervised manner, using masked language +modeling as the learning signal and backpropagating through a retrieval step that considers millions of documents. We +demonstrate the effectiveness of Retrieval-Augmented Language Model pre-training (REALM) by fine-tuning on the +challenging task of Open-domain Question Answering (Open-QA). We compare against state-of-the-art models for both +explicit and implicit knowledge storage on three popular Open-QA benchmarks, and find that we outperform all previous +methods by a significant margin (4-16% absolute accuracy), while also providing qualitative benefits such as +interpretability and modularity.* + +This model was contributed by [qqaatw](https://huggingface.co/qqaatw). The original code can be found +[here](https://github.com/google-research/language/tree/master/language/realm). 
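
The sketch below shows how the pieces documented on this page are intended to fit together for open-domain QA. It is illustrative only and assumes the `google/realm-orqa-nq-openqa` checkpoint; check the [`RealmForOpenQA`] and [`RealmRetriever`] references below for the exact call signatures.

```python
from transformers import RealmForOpenQA, RealmRetriever, RealmTokenizer

# The retriever holds the pre-computed block records; the open-QA model wraps
# the scorer and reader components around it.
retriever = RealmRetriever.from_pretrained("google/realm-orqa-nq-openqa")
tokenizer = RealmTokenizer.from_pretrained("google/realm-orqa-nq-openqa")
model = RealmForOpenQA.from_pretrained("google/realm-orqa-nq-openqa", retriever=retriever)

question = "Who is the pioneer in modern computer science?"
question_ids = tokenizer([question], return_tensors="pt")
# Reference answers are only needed to compute the reader loss.
answer_ids = tokenizer(
    ["alan mathison turing"],
    add_special_tokens=False,
    return_token_type_ids=False,
    return_attention_mask=False,
).input_ids

reader_output, predicted_answer_ids = model(**question_ids, answer_ids=answer_ids, return_dict=False)
print(tokenizer.decode(predicted_answer_ids))
```
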
+ +## RealmConfig + +[[autodoc]] RealmConfig + +## RealmTokenizer + +[[autodoc]] RealmTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + - batch_encode_candidates + +## RealmTokenizerFast + +[[autodoc]] RealmTokenizerFast + - batch_encode_candidates + +## RealmRetriever + +[[autodoc]] RealmRetriever + +## RealmEmbedder + +[[autodoc]] RealmEmbedder + - forward + +## RealmScorer + +[[autodoc]] RealmScorer + - forward + +## RealmKnowledgeAugEncoder + +[[autodoc]] RealmKnowledgeAugEncoder + - forward + +## RealmReader + +[[autodoc]] RealmReader + - forward + +## RealmForOpenQA + +[[autodoc]] RealmForOpenQA + - block_embedding_to + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/realm.mdx b/docs/source/en/model_doc/realm.mdx deleted file mode 100644 index 545b1e0a3bf8..000000000000 --- a/docs/source/en/model_doc/realm.mdx +++ /dev/null @@ -1,85 +0,0 @@ - - -# REALM - -## Overview - -The REALM model was proposed in [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. It's a -retrieval-augmented language model that firstly retrieves documents from a textual knowledge corpus and then -utilizes retrieved documents to process question answering tasks. - -The abstract from the paper is the following: - -*Language model pre-training has been shown to capture a surprising amount of world knowledge, crucial for NLP tasks -such as question answering. However, this knowledge is stored implicitly in the parameters of a neural network, -requiring ever-larger networks to cover more facts. To capture knowledge in a more modular and interpretable way, we -augment language model pre-training with a latent knowledge retriever, which allows the model to retrieve and attend -over documents from a large corpus such as Wikipedia, used during pre-training, fine-tuning and inference. For the -first time, we show how to pre-train such a knowledge retriever in an unsupervised manner, using masked language -modeling as the learning signal and backpropagating through a retrieval step that considers millions of documents. We -demonstrate the effectiveness of Retrieval-Augmented Language Model pre-training (REALM) by fine-tuning on the -challenging task of Open-domain Question Answering (Open-QA). We compare against state-of-the-art models for both -explicit and implicit knowledge storage on three popular Open-QA benchmarks, and find that we outperform all previous -methods by a significant margin (4-16% absolute accuracy), while also providing qualitative benefits such as -interpretability and modularity.* - -This model was contributed by [qqaatw](https://huggingface.co/qqaatw). The original code can be found -[here](https://github.com/google-research/language/tree/master/language/realm). 
- -## RealmConfig - -[[autodoc]] RealmConfig - -## RealmTokenizer - -[[autodoc]] RealmTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - - batch_encode_candidates - -## RealmTokenizerFast - -[[autodoc]] RealmTokenizerFast - - batch_encode_candidates - -## RealmRetriever - -[[autodoc]] RealmRetriever - -## RealmEmbedder - -[[autodoc]] RealmEmbedder - - forward - -## RealmScorer - -[[autodoc]] RealmScorer - - forward - -## RealmKnowledgeAugEncoder - -[[autodoc]] RealmKnowledgeAugEncoder - - forward - -## RealmReader - -[[autodoc]] RealmReader - - forward - -## RealmForOpenQA - -[[autodoc]] RealmForOpenQA - - block_embedding_to - - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/reformer.md b/docs/source/en/model_doc/reformer.md new file mode 100644 index 000000000000..05274c7667b7 --- /dev/null +++ b/docs/source/en/model_doc/reformer.md @@ -0,0 +1,201 @@ + + +# Reformer + +
+<!-- Hub badges: Models, Spaces -->

**DISCLAIMER:** This model is still a work in progress; if you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title).

## Overview

The Reformer model was proposed in the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451.pdf) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.

The abstract from the paper is the following:

*Large Transformer models routinely achieve state-of-the-art results on a number of tasks but training these models can
be prohibitively costly, especially on long sequences. We introduce two techniques to improve the efficiency of
Transformers. For one, we replace dot-product attention by one that uses locality-sensitive hashing, changing its
complexity from O(L^2) to O(Llog(L)), where L is the length of the sequence. Furthermore, we use reversible residual
layers instead of the standard residuals, which allows storing activations only once in the training process instead of
N times, where N is the number of layers. The resulting model, the Reformer, performs on par with Transformer models
while being much more memory-efficient and much faster on long sequences.*

This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be
found [here](https://github.com/google/trax/tree/master/trax/models/reformer).

Tips:

- Reformer does **not** work with *torch.nn.DataParallel* due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035).
- Use Axial position encoding (see below for more details). It’s a mechanism to avoid having a huge positional encoding matrix (when the sequence length is very big) by factorizing it into smaller matrices.
- Replace traditional attention by LSH (locality-sensitive hashing) attention (see below for more details). It’s a technique to avoid computing the full query-key product in the attention layers.
- Avoid storing the intermediate results of each layer by using reversible transformer layers to obtain them during the backward pass (subtracting the residuals from the input of the next layer gives them back) or by recomputing them for results inside a given layer (less efficient than storing them but saves memory).
- Compute the feedforward operations by chunks and not on the whole batch.

## Axial Positional Encodings

Axial Positional Encodings were first implemented in Google's [trax library](https://github.com/google/trax/blob/4d99ad4965bab1deba227539758d59f0df0fef48/trax/layers/research/position_encodings.py#L29)
and developed by the authors of this model's paper. In models that process very long input sequences, the
conventional position id encodings store an embeddings vector of size \\(d\\), being the `config.hidden_size`, for
every position \\(i, \ldots, n_s\\), with \\(n_s\\) being `config.max_embedding_size`. This means that having
a sequence length of \\(n_s = 2^{19} \approx 0.5M\\) and a `config.hidden_size` of \\(d = 2^{10} \approx 1000\\)
would result in a position encoding matrix:

$$X_{i,j}, \text{ with } i \in \left[1,\ldots, d\right] \text{ and } j \in \left[1,\ldots, n_s\right]$$

which alone has over 500M parameters to store.
Axial positional encodings factorize \\(X_{i,j}\\) into two matrices:

$$X^{1}_{i,j}, \text{ with } i \in \left[1,\ldots, d^1\right] \text{ and } j \in \left[1,\ldots, n_s^1\right]$$

and

$$X^{2}_{i,j}, \text{ with } i \in \left[1,\ldots, d^2\right] \text{ and } j \in \left[1,\ldots, n_s^2\right]$$

with:

$$d = d^1 + d^2 \text{ and } n_s = n_s^1 \times n_s^2 .$$

Therefore the following holds:

$$X_{i,j} = \begin{cases}
X^{1}_{i, k}, & \text{if }\ i < d^1 \text{ with } k = j \mod n_s^1 \\
X^{2}_{i - d^1, l}, & \text{if } i \ge d^1 \text{ with } l = \lfloor\frac{j}{n_s^1}\rfloor
\end{cases}$$

Intuitively, this means that a position embedding vector \\(x_j \in \mathbb{R}^{d}\\) is now the composition of two
factorized embedding vectors: \\(x^1_{k, l} + x^2_{l, k}\\), where the `config.max_embedding_size` dimension
\\(j\\) is factorized into \\(k \text{ and } l\\). This design ensures that each position embedding vector
\\(x_j\\) is unique.

Using the above example again, axial position encoding with \\(d^1 = 2^9, d^2 = 2^9, n_s^1 = 2^9, n_s^2 = 2^{10}\\)
can drastically reduce the number of parameters from 500 000 000 to \\(2^{18} + 2^{19} \approx 780 000\\) parameters,
i.e. roughly 99.85% less memory for the position encodings.

In practice, the parameter `config.axial_pos_embds_dim` is set to a tuple \\((d^1, d^2)\\) whose sum has to be
equal to `config.hidden_size` and `config.axial_pos_shape` is set to a tuple \\((n_s^1, n_s^2)\\) whose
product has to be equal to `config.max_embedding_size`, which during training has to be equal to the *sequence
length* of the `input_ids`.


## LSH Self Attention

In locality-sensitive hashing (LSH) self attention, the key and query projection weights are tied. Therefore, the key
query embedding vectors are also tied. LSH self attention uses the locality-sensitive hashing mechanism proposed in
[Practical and Optimal LSH for Angular Distance](https://arxiv.org/abs/1509.02897) to assign each of the tied key
query embedding vectors to one of `config.num_buckets` possible buckets. The premise is that the more "similar"
key query embedding vectors (in terms of *cosine similarity*) are to each other, the more likely they are assigned to
the same bucket.

The accuracy of the LSH mechanism can be improved by increasing `config.num_hashes` or directly the argument
`num_hashes` of the forward function so that the output of the LSH self attention better approximates the output
of the "normal" full self attention. The buckets are then sorted and chunked into query key embedding vector chunks
each of length `config.lsh_chunk_length`. For each chunk, the query embedding vectors attend to their key vectors
(which are tied to themselves) and to the key embedding vectors of `config.lsh_num_chunks_before` previous
neighboring chunks and `config.lsh_num_chunks_after` following neighboring chunks.

For more information, see the [original Paper](https://arxiv.org/abs/2001.04451) or this great [blog post](https://www.pragmatic.ml/reformer-deep-dive/).

Note that `config.num_buckets` can also be factorized into a list \\((n_{\text{buckets}}^1,
n_{\text{buckets}}^2)\\). This way, instead of assigning the query key embedding vectors to one of \\((1,\ldots,
n_{\text{buckets}})\\), they are assigned to one of \\((1-1,\ldots, n_{\text{buckets}}^1-1, \ldots,
1-n_{\text{buckets}}^2, \ldots, n_{\text{buckets}}^1-n_{\text{buckets}}^2)\\). This is crucial for very long sequences to
save memory.
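
As a rough illustration of how these settings interact, here is a configuration sketch. It is only an example: the argument names should be double-checked against the [`ReformerConfig`] reference below, and the values are chosen purely to satisfy the constraints described above.

```python
from transformers import ReformerConfig, ReformerModel

config = ReformerConfig(
    hidden_size=256,
    axial_pos_embds=True,
    # The two axial dims must sum to hidden_size: 64 + 192 = 256.
    axial_pos_embds_dim=(64, 192),
    # The two axial shape factors must multiply to the training sequence length: 64 * 64 = 4096.
    axial_pos_shape=(64, 64),
    max_position_embeddings=4096,
    # Alternate local and LSH attention layers.
    attn_layers=["local", "lsh", "local", "lsh", "local", "lsh"],
    num_hashes=2,      # more hashes -> LSH attention closer to full self attention
    num_buckets=None,  # let a suitable value be computed from the sequence length
)
model = ReformerModel(config)
```
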
+ +When training a model from scratch, it is recommended to leave `config.num_buckets=None`, so that depending on the +sequence length a good value for `num_buckets` is calculated on the fly. This value will then automatically be +saved in the config and should be reused for inference. + +Using LSH self attention, the memory and time complexity of the query-key matmul operation can be reduced from +\\(\mathcal{O}(n_s \times n_s)\\) to \\(\mathcal{O}(n_s \times \log(n_s))\\), which usually represents the memory +and time bottleneck in a transformer model, with \\(n_s\\) being the sequence length. + + +## Local Self Attention + +Local self attention is essentially a "normal" self attention layer with key, query and value projections, but is +chunked so that in each chunk of length `config.local_chunk_length` the query embedding vectors only attends to +the key embedding vectors in its chunk and to the key embedding vectors of `config.local_num_chunks_before` +previous neighboring chunks and `config.local_num_chunks_after` following neighboring chunks. + +Using Local self attention, the memory and time complexity of the query-key matmul operation can be reduced from +\\(\mathcal{O}(n_s \times n_s)\\) to \\(\mathcal{O}(n_s \times \log(n_s))\\), which usually represents the memory +and time bottleneck in a transformer model, with \\(n_s\\) being the sequence length. + + +## Training + +During training, we must ensure that the sequence length is set to a value that can be divided by the least common +multiple of `config.lsh_chunk_length` and `config.local_chunk_length` and that the parameters of the Axial +Positional Encodings are correctly set as described above. Reformer is very memory efficient so that the model can +easily be trained on sequences as long as 64000 tokens. + +For training, the [`ReformerModelWithLMHead`] should be used as follows: + +```python +input_ids = tokenizer.encode("This is a sentence from the training data", return_tensors="pt") +loss = model(input_ids, labels=input_ids)[0] +``` + +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Question answering task guide](../tasks/question_answering) +- [Causal language modeling task guide](../tasks/language_modeling) +- [Masked language modeling task guide](../tasks/masked_language_modeling) + +## ReformerConfig + +[[autodoc]] ReformerConfig + +## ReformerTokenizer + +[[autodoc]] ReformerTokenizer + - save_vocabulary + +## ReformerTokenizerFast + +[[autodoc]] ReformerTokenizerFast + +## ReformerModel + +[[autodoc]] ReformerModel + - forward + +## ReformerModelWithLMHead + +[[autodoc]] ReformerModelWithLMHead + - forward + +## ReformerForMaskedLM + +[[autodoc]] ReformerForMaskedLM + - forward + +## ReformerForSequenceClassification + +[[autodoc]] ReformerForSequenceClassification + - forward + +## ReformerForQuestionAnswering + +[[autodoc]] ReformerForQuestionAnswering + - forward diff --git a/docs/source/en/model_doc/reformer.mdx b/docs/source/en/model_doc/reformer.mdx deleted file mode 100644 index 777a333e7b1f..000000000000 --- a/docs/source/en/model_doc/reformer.mdx +++ /dev/null @@ -1,177 +0,0 @@ - - -# Reformer - -**DISCLAIMER:** This model is still a work in progress, if you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title). 
- -## Overview - -The Reformer model was proposed in the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451.pdf) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. - -The abstract from the paper is the following: - -*Large Transformer models routinely achieve state-of-the-art results on a number of tasks but training these models can -be prohibitively costly, especially on long sequences. We introduce two techniques to improve the efficiency of -Transformers. For one, we replace dot-product attention by one that uses locality-sensitive hashing, changing its -complexity from O(L^2) to O(Llog(L)), where L is the length of the sequence. Furthermore, we use reversible residual -layers instead of the standard residuals, which allows storing activations only once in the training process instead of -N times, where N is the number of layers. The resulting model, the Reformer, performs on par with Transformer models -while being much more memory-efficient and much faster on long sequences.* - -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be -found [here](https://github.com/google/trax/tree/master/trax/models/reformer). - -**Note**: - -- Reformer does **not** work with *torch.nn.DataParallel* due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035) - -## Axial Positional Encodings - -Axial Positional Encodings were first implemented in Google's [trax library](https://github.com/google/trax/blob/4d99ad4965bab1deba227539758d59f0df0fef48/trax/layers/research/position_encodings.py#L29) -and developed by the authors of this model's paper. In models that are treating very long input sequences, the -conventional position id encodings store an embedings vector of size \\(d\\) being the `config.hidden_size` for -every position \\(i, \ldots, n_s\\), with \\(n_s\\) being `config.max_embedding_size`. This means that having -a sequence length of \\(n_s = 2^{19} \approx 0.5M\\) and a `config.hidden_size` of \\(d = 2^{10} \approx 1000\\) -would result in a position encoding matrix: - -$$X_{i,j}, \text{ with } i \in \left[1,\ldots, d\right] \text{ and } j \in \left[1,\ldots, n_s\right]$$ - -which alone has over 500M parameters to store. Axial positional encodings factorize \\(X_{i,j}\\) into two matrices: - -$$X^{1}_{i,j}, \text{ with } i \in \left[1,\ldots, d^1\right] \text{ and } j \in \left[1,\ldots, n_s^1\right]$$ - -and - -$$X^{2}_{i,j}, \text{ with } i \in \left[1,\ldots, d^2\right] \text{ and } j \in \left[1,\ldots, n_s^2\right]$$ - -with: - -$$d = d^1 + d^2 \text{ and } n_s = n_s^1 \times n_s^2 .$$ - -Therefore the following holds: - -$$X_{i,j} = \begin{cases} -X^{1}_{i, k}, & \text{if }\ i < d^1 \text{ with } k = j \mod n_s^1 \\ -X^{2}_{i - d^1, l}, & \text{if } i \ge d^1 \text{ with } l = \lfloor\frac{j}{n_s^1}\rfloor -\end{cases}$$ - -Intuitively, this means that a position embedding vector \\(x_j \in \mathbb{R}^{d}\\) is now the composition of two -factorized embedding vectors: \\(x^1_{k, l} + x^2_{l, k}\\), where as the `config.max_embedding_size` dimension -\\(j\\) is factorized into \\(k \text{ and } l\\). This design ensures that each position embedding vector -\\(x_j\\) is unique. - -Using the above example again, axial position encoding with \\(d^1 = 2^5, d^2 = 2^5, n_s^1 = 2^9, n_s^2 = 2^{10}\\) -can drastically reduced the number of parameters to \\(2^{14} + 2^{15} \approx 49000\\) parameters. 
- -In practice, the parameter `config.axial_pos_embds_dim` is set to a tuple \\((d^1, d^2)\\) which sum has to be -equal to `config.hidden_size` and `config.axial_pos_shape` is set to a tuple \\((n_s^1, n_s^2)\\) which -product has to be equal to `config.max_embedding_size`, which during training has to be equal to the *sequence -length* of the `input_ids`. - - -## LSH Self Attention - -In Locality sensitive hashing (LSH) self attention the key and query projection weights are tied. Therefore, the key -query embedding vectors are also tied. LSH self attention uses the locality sensitive hashing mechanism proposed in -[Practical and Optimal LSH for Angular Distance](https://arxiv.org/abs/1509.02897) to assign each of the tied key -query embedding vectors to one of `config.num_buckets` possible buckets. The premise is that the more "similar" -key query embedding vectors (in terms of *cosine similarity*) are to each other, the more likely they are assigned to -the same bucket. - -The accuracy of the LSH mechanism can be improved by increasing `config.num_hashes` or directly the argument -`num_hashes` of the forward function so that the output of the LSH self attention better approximates the output -of the "normal" full self attention. The buckets are then sorted and chunked into query key embedding vector chunks -each of length `config.lsh_chunk_length`. For each chunk, the query embedding vectors attend to its key vectors -(which are tied to themselves) and to the key embedding vectors of `config.lsh_num_chunks_before` previous -neighboring chunks and `config.lsh_num_chunks_after` following neighboring chunks. - -For more information, see the [original Paper](https://arxiv.org/abs/2001.04451) or this great [blog post](https://www.pragmatic.ml/reformer-deep-dive/). - -Note that `config.num_buckets` can also be factorized into a list \\((n_{\text{buckets}}^1, -n_{\text{buckets}}^2)\\). This way instead of assigning the query key embedding vectors to one of \\((1,\ldots, -n_{\text{buckets}})\\) they are assigned to one of \\((1-1,\ldots, n_{\text{buckets}}^1-1, \ldots, -1-n_{\text{buckets}}^2, \ldots, n_{\text{buckets}}^1-n_{\text{buckets}}^2)\\). This is crucial for very long sequences to -save memory. - -When training a model from scratch, it is recommended to leave `config.num_buckets=None`, so that depending on the -sequence length a good value for `num_buckets` is calculated on the fly. This value will then automatically be -saved in the config and should be reused for inference. - -Using LSH self attention, the memory and time complexity of the query-key matmul operation can be reduced from -\\(\mathcal{O}(n_s \times n_s)\\) to \\(\mathcal{O}(n_s \times \log(n_s))\\), which usually represents the memory -and time bottleneck in a transformer model, with \\(n_s\\) being the sequence length. - - -## Local Self Attention - -Local self attention is essentially a "normal" self attention layer with key, query and value projections, but is -chunked so that in each chunk of length `config.local_chunk_length` the query embedding vectors only attends to -the key embedding vectors in its chunk and to the key embedding vectors of `config.local_num_chunks_before` -previous neighboring chunks and `config.local_num_chunks_after` following neighboring chunks. 
- -Using Local self attention, the memory and time complexity of the query-key matmul operation can be reduced from -\\(\mathcal{O}(n_s \times n_s)\\) to \\(\mathcal{O}(n_s \times \log(n_s))\\), which usually represents the memory -and time bottleneck in a transformer model, with \\(n_s\\) being the sequence length. - - -## Training - -During training, we must ensure that the sequence length is set to a value that can be divided by the least common -multiple of `config.lsh_chunk_length` and `config.local_chunk_length` and that the parameters of the Axial -Positional Encodings are correctly set as described above. Reformer is very memory efficient so that the model can -easily be trained on sequences as long as 64000 tokens. - -For training, the [`ReformerModelWithLMHead`] should be used as follows: - -```python -input_ids = tokenizer.encode("This is a sentence from the training data", return_tensors="pt") -loss = model(input_ids, labels=input_ids)[0] -``` - -## ReformerConfig - -[[autodoc]] ReformerConfig - -## ReformerTokenizer - -[[autodoc]] ReformerTokenizer - - save_vocabulary - -## ReformerTokenizerFast - -[[autodoc]] ReformerTokenizerFast - -## ReformerModel - -[[autodoc]] ReformerModel - - forward - -## ReformerModelWithLMHead - -[[autodoc]] ReformerModelWithLMHead - - forward - -## ReformerForMaskedLM - -[[autodoc]] ReformerForMaskedLM - - forward - -## ReformerForSequenceClassification - -[[autodoc]] ReformerForSequenceClassification - - forward - -## ReformerForQuestionAnswering - -[[autodoc]] ReformerForQuestionAnswering - - forward diff --git a/docs/source/en/model_doc/regnet.md b/docs/source/en/model_doc/regnet.md new file mode 100644 index 000000000000..89e89459bd7f --- /dev/null +++ b/docs/source/en/model_doc/regnet.md @@ -0,0 +1,86 @@ + + +# RegNet + +## Overview + +The RegNet model was proposed in [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. + +The authors design search spaces to perform Neural Architecture Search (NAS). They first start from a high dimensional search space and iteratively reduce the search space by empirically applying constraints based on the best-performing models sampled by the current search space. + +The abstract from the paper is the following: + +*In this work, we present a new network design paradigm. Our goal is to help advance the understanding of network design and discover design principles that generalize across settings. Instead of focusing on designing individual network instances, we design network design spaces that parametrize populations of networks. The overall process is analogous to classic manual design of networks, but elevated to the design space level. Using our methodology we explore the structure aspect of network design and arrive at a low-dimensional design space consisting of simple, regular networks that we call RegNet. The core insight of the RegNet parametrization is surprisingly simple: widths and depths of good networks can be explained by a quantized linear function. We analyze the RegNet design space and arrive at interesting findings that do not match the current practice of network design. The RegNet design space provides simple and fast networks that work well across a wide range of flop regimes. 
Under comparable training settings and flops, the RegNet models outperform the popular EfficientNet models while being up to 5x faster on GPUs.* + +Tips: + +- One can use [`AutoImageProcessor`] to prepare images for the model. +- The huge 10B model from [Self-supervised Pretraining of Visual Features in the Wild](https://arxiv.org/abs/2103.01988), trained on one billion Instagram images, is available on the [hub](https://huggingface.co/facebook/regnet-y-10b-seer) + +This model was contributed by [Francesco](https://huggingface.co/Francesco). The TensorFlow version of the model +was contributed by [sayakpaul](https://huggingface.com/sayakpaul) and [ariG23498](https://huggingface.com/ariG23498). +The original code can be found [here](https://github.com/facebookresearch/pycls). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with RegNet. + + + +- [`RegNetForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). +- See also: [Image classification task guide](../tasks/image_classification) + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## RegNetConfig + +[[autodoc]] RegNetConfig + + +## RegNetModel + +[[autodoc]] RegNetModel + - forward + + +## RegNetForImageClassification + +[[autodoc]] RegNetForImageClassification + - forward + +## TFRegNetModel + +[[autodoc]] TFRegNetModel + - call + + +## TFRegNetForImageClassification + +[[autodoc]] TFRegNetForImageClassification + - call + + +## FlaxRegNetModel + +[[autodoc]] FlaxRegNetModel + - __call__ + + +## FlaxRegNetForImageClassification + +[[autodoc]] FlaxRegNetForImageClassification + - __call__ \ No newline at end of file diff --git a/docs/source/en/model_doc/regnet.mdx b/docs/source/en/model_doc/regnet.mdx deleted file mode 100644 index a426ad8fa146..000000000000 --- a/docs/source/en/model_doc/regnet.mdx +++ /dev/null @@ -1,60 +0,0 @@ - - -# RegNet - -## Overview - -The RegNet model was proposed in [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. - -The authors design search spaces to perform Neural Architecture Search (NAS). They first start from a high dimensional search space and iteratively reduce the search space by empirically applying constraints based on the best-performing models sampled by the current search space. - -The abstract from the paper is the following: - -*In this work, we present a new network design paradigm. Our goal is to help advance the understanding of network design and discover design principles that generalize across settings. Instead of focusing on designing individual network instances, we design network design spaces that parametrize populations of networks. The overall process is analogous to classic manual design of networks, but elevated to the design space level. Using our methodology we explore the structure aspect of network design and arrive at a low-dimensional design space consisting of simple, regular networks that we call RegNet. 
The core insight of the RegNet parametrization is surprisingly simple: widths and depths of good networks can be explained by a quantized linear function. We analyze the RegNet design space and arrive at interesting findings that do not match the current practice of network design. The RegNet design space provides simple and fast networks that work well across a wide range of flop regimes. Under comparable training settings and flops, the RegNet models outperform the popular EfficientNet models while being up to 5x faster on GPUs.* - -Tips: - -- One can use [`AutoImageProcessor`] to prepare images for the model. -- The huge 10B model from [Self-supervised Pretraining of Visual Features in the Wild](https://arxiv.org/abs/2103.01988), trained on one billion Instagram images, is available on the [hub](https://huggingface.co/facebook/regnet-y-10b-seer) - -This model was contributed by [Francesco](https://huggingface.co/Francesco). The TensorFlow version of the model -was contributed by [sayakpaul](https://huggingface.com/sayakpaul) and [ariG23498](https://huggingface.com/ariG23498). -The original code can be found [here](https://github.com/facebookresearch/pycls). - - -## RegNetConfig - -[[autodoc]] RegNetConfig - - -## RegNetModel - -[[autodoc]] RegNetModel - - forward - - -## RegNetForImageClassification - -[[autodoc]] RegNetForImageClassification - - forward - -## TFRegNetModel - -[[autodoc]] TFRegNetModel - - call - - -## TFRegNetForImageClassification - -[[autodoc]] TFRegNetForImageClassification - - call \ No newline at end of file diff --git a/docs/source/en/model_doc/rembert.md b/docs/source/en/model_doc/rembert.md new file mode 100644 index 000000000000..b2e4d0f5adae --- /dev/null +++ b/docs/source/en/model_doc/rembert.md @@ -0,0 +1,141 @@ + + +# RemBERT + +## Overview + +The RemBERT model was proposed in [Rethinking Embedding Coupling in Pre-trained Language Models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, Melvin Johnson, Sebastian Ruder. + +The abstract from the paper is the following: + +*We re-evaluate the standard practice of sharing weights between input and output embeddings in state-of-the-art +pre-trained language models. We show that decoupled embeddings provide increased modeling flexibility, allowing us to +significantly improve the efficiency of parameter allocation in the input embedding of multilingual models. By +reallocating the input embedding parameters in the Transformer layers, we achieve dramatically better performance on +standard natural language understanding tasks with the same number of parameters during fine-tuning. We also show that +allocating additional capacity to the output embedding provides benefits to the model that persist through the +fine-tuning stage even though the output embedding is discarded after pre-training. Our analysis shows that larger +output embeddings prevent the model's last layers from overspecializing to the pre-training task and encourage +Transformer representations to be more general and more transferable to other tasks and languages. Harnessing these +findings, we are able to train models that achieve strong performance on the XTREME benchmark without increasing the +number of parameters at the fine-tuning stage.* + +Tips: + +For fine-tuning, RemBERT can be thought of as a bigger version of mBERT with an ALBERT-like factorization of the +embedding layer. 
The embeddings are not tied in pre-training, in contrast with BERT, which enables smaller input +embeddings (preserved during fine-tuning) and bigger output embeddings (discarded at fine-tuning). The tokenizer is +also similar to the Albert one rather than the BERT one. + +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Token classification task guide](../tasks/token_classification) +- [Question answering task guide](../tasks/question_answering) +- [Causal language modeling task guide](../tasks/language_modeling) +- [Masked language modeling task guide](../tasks/masked_language_modeling) +- [Multiple choice task guide](../tasks/multiple_choice) + +## RemBertConfig + +[[autodoc]] RemBertConfig + +## RemBertTokenizer + +[[autodoc]] RemBertTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## RemBertTokenizerFast + +[[autodoc]] RemBertTokenizerFast + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## RemBertModel + +[[autodoc]] RemBertModel + - forward + +## RemBertForCausalLM + +[[autodoc]] RemBertForCausalLM + - forward + +## RemBertForMaskedLM + +[[autodoc]] RemBertForMaskedLM + - forward + +## RemBertForSequenceClassification + +[[autodoc]] RemBertForSequenceClassification + - forward + +## RemBertForMultipleChoice + +[[autodoc]] RemBertForMultipleChoice + - forward + +## RemBertForTokenClassification + +[[autodoc]] RemBertForTokenClassification + - forward + +## RemBertForQuestionAnswering + +[[autodoc]] RemBertForQuestionAnswering + - forward + +## TFRemBertModel + +[[autodoc]] TFRemBertModel + - call + +## TFRemBertForMaskedLM + +[[autodoc]] TFRemBertForMaskedLM + - call + +## TFRemBertForCausalLM + +[[autodoc]] TFRemBertForCausalLM + - call + +## TFRemBertForSequenceClassification + +[[autodoc]] TFRemBertForSequenceClassification + - call + +## TFRemBertForMultipleChoice + +[[autodoc]] TFRemBertForMultipleChoice + - call + +## TFRemBertForTokenClassification + +[[autodoc]] TFRemBertForTokenClassification + - call + +## TFRemBertForQuestionAnswering + +[[autodoc]] TFRemBertForQuestionAnswering + - call diff --git a/docs/source/en/model_doc/rembert.mdx b/docs/source/en/model_doc/rembert.mdx deleted file mode 100644 index 0edb8e5202d9..000000000000 --- a/docs/source/en/model_doc/rembert.mdx +++ /dev/null @@ -1,128 +0,0 @@ - - -# RemBERT - -## Overview - -The RemBERT model was proposed in [Rethinking Embedding Coupling in Pre-trained Language Models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, Melvin Johnson, Sebastian Ruder. - -The abstract from the paper is the following: - -*We re-evaluate the standard practice of sharing weights between input and output embeddings in state-of-the-art -pre-trained language models. We show that decoupled embeddings provide increased modeling flexibility, allowing us to -significantly improve the efficiency of parameter allocation in the input embedding of multilingual models. By -reallocating the input embedding parameters in the Transformer layers, we achieve dramatically better performance on -standard natural language understanding tasks with the same number of parameters during fine-tuning. 
We also show that -allocating additional capacity to the output embedding provides benefits to the model that persist through the -fine-tuning stage even though the output embedding is discarded after pre-training. Our analysis shows that larger -output embeddings prevent the model's last layers from overspecializing to the pre-training task and encourage -Transformer representations to be more general and more transferable to other tasks and languages. Harnessing these -findings, we are able to train models that achieve strong performance on the XTREME benchmark without increasing the -number of parameters at the fine-tuning stage.* - -Tips: - -For fine-tuning, RemBERT can be thought of as a bigger version of mBERT with an ALBERT-like factorization of the -embedding layer. The embeddings are not tied in pre-training, in contrast with BERT, which enables smaller input -embeddings (preserved during fine-tuning) and bigger output embeddings (discarded at fine-tuning). The tokenizer is -also similar to the Albert one rather than the BERT one. - -## RemBertConfig - -[[autodoc]] RemBertConfig - -## RemBertTokenizer - -[[autodoc]] RemBertTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## RemBertTokenizerFast - -[[autodoc]] RemBertTokenizerFast - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## RemBertModel - -[[autodoc]] RemBertModel - - forward - -## RemBertForCausalLM - -[[autodoc]] RemBertForCausalLM - - forward - -## RemBertForMaskedLM - -[[autodoc]] RemBertForMaskedLM - - forward - -## RemBertForSequenceClassification - -[[autodoc]] RemBertForSequenceClassification - - forward - -## RemBertForMultipleChoice - -[[autodoc]] RemBertForMultipleChoice - - forward - -## RemBertForTokenClassification - -[[autodoc]] RemBertForTokenClassification - - forward - -## RemBertForQuestionAnswering - -[[autodoc]] RemBertForQuestionAnswering - - forward - -## TFRemBertModel - -[[autodoc]] TFRemBertModel - - call - -## TFRemBertForMaskedLM - -[[autodoc]] TFRemBertForMaskedLM - - call - -## TFRemBertForCausalLM - -[[autodoc]] TFRemBertForCausalLM - - call - -## TFRemBertForSequenceClassification - -[[autodoc]] TFRemBertForSequenceClassification - - call - -## TFRemBertForMultipleChoice - -[[autodoc]] TFRemBertForMultipleChoice - - call - -## TFRemBertForTokenClassification - -[[autodoc]] TFRemBertForTokenClassification - - call - -## TFRemBertForQuestionAnswering - -[[autodoc]] TFRemBertForQuestionAnswering - - call diff --git a/docs/source/en/model_doc/resnet.md b/docs/source/en/model_doc/resnet.md new file mode 100644 index 000000000000..9bb36a776f16 --- /dev/null +++ b/docs/source/en/model_doc/resnet.md @@ -0,0 +1,87 @@ + + +# ResNet + +## Overview + +The ResNet model was proposed in [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. Our implementation follows the small changes made by [Nvidia](https://catalog.ngc.nvidia.com/orgs/nvidia/resources/resnet_50_v1_5_for_pytorch), we apply the `stride=2` for downsampling in bottleneck's `3x3` conv and not in the first `1x1`. This is generally known as "ResNet v1.5". + +ResNet introduced residual connections, they allow to train networks with an unseen number of layers (up to 1000). ResNet won the 2015 ILSVRC & COCO competition, one important milestone in deep computer vision. 
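
As a quick illustration, the sketch below runs image classification with a pretrained checkpoint. It is only an example and assumes the `microsoft/resnet-50` checkpoint and the small `huggingface/cats-image` dataset are available; see the tips and resources further down for more on preprocessing with [`AutoImageProcessor`].

```python
import torch
from datasets import load_dataset
from transformers import AutoImageProcessor, ResNetForImageClassification

# A single test image of a cat, used here purely for demonstration.
dataset = load_dataset("huggingface/cats-image", split="test")
image = dataset[0]["image"]

processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50")

inputs = processor(image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# Map the highest-scoring logit back to its ImageNet label.
predicted_label = logits.argmax(-1).item()
print(model.config.id2label[predicted_label])
```
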
+ +The abstract from the paper is the following: + +*Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers---8x deeper than VGG nets but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers. +The depth of representations is of central importance for many visual recognition tasks. Solely due to our extremely deep representations, we obtain a 28% relative improvement on the COCO object detection dataset. Deep residual nets are foundations of our submissions to ILSVRC & COCO 2015 competitions, where we also won the 1st places on the tasks of ImageNet detection, ImageNet localization, COCO detection, and COCO segmentation.* + +Tips: + +- One can use [`AutoImageProcessor`] to prepare images for the model. + +The figure below illustrates the architecture of ResNet. Taken from the [original paper](https://arxiv.org/abs/1512.03385). + + + +This model was contributed by [Francesco](https://huggingface.co/Francesco). The TensorFlow version of this model was added by [amyeroberts](https://huggingface.co/amyeroberts). The original code can be found [here](https://github.com/KaimingHe/deep-residual-networks). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ResNet. + + + +- [`ResNetForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). +- See also: [Image classification task guide](../tasks/image_classification) + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## ResNetConfig + +[[autodoc]] ResNetConfig + + +## ResNetModel + +[[autodoc]] ResNetModel + - forward + + +## ResNetForImageClassification + +[[autodoc]] ResNetForImageClassification + - forward + + +## TFResNetModel + +[[autodoc]] TFResNetModel + - call + + +## TFResNetForImageClassification + +[[autodoc]] TFResNetForImageClassification + - call + +## FlaxResNetModel + +[[autodoc]] FlaxResNetModel + - __call__ + +## FlaxResNetForImageClassification + +[[autodoc]] FlaxResNetForImageClassification + - __call__ diff --git a/docs/source/en/model_doc/resnet.mdx b/docs/source/en/model_doc/resnet.mdx deleted file mode 100644 index ce1799e8d48a..000000000000 --- a/docs/source/en/model_doc/resnet.mdx +++ /dev/null @@ -1,62 +0,0 @@ - - -# ResNet - -## Overview - -The ResNet model was proposed in [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. 
Our implementation follows the small changes made by [Nvidia](https://catalog.ngc.nvidia.com/orgs/nvidia/resources/resnet_50_v1_5_for_pytorch), we apply the `stride=2` for downsampling in bottleneck's `3x3` conv and not in the first `1x1`. This is generally known as "ResNet v1.5". - -ResNet introduced residual connections, they allow to train networks with an unseen number of layers (up to 1000). ResNet won the 2015 ILSVRC & COCO competition, one important milestone in deep computer vision. - -The abstract from the paper is the following: - -*Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers---8x deeper than VGG nets but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers. -The depth of representations is of central importance for many visual recognition tasks. Solely due to our extremely deep representations, we obtain a 28% relative improvement on the COCO object detection dataset. Deep residual nets are foundations of our submissions to ILSVRC & COCO 2015 competitions, where we also won the 1st places on the tasks of ImageNet detection, ImageNet localization, COCO detection, and COCO segmentation.* - -Tips: - -- One can use [`AutoImageProcessor`] to prepare images for the model. - -The figure below illustrates the architecture of ResNet. Taken from the [original paper](https://arxiv.org/abs/1512.03385). - - - -This model was contributed by [Francesco](https://huggingface.co/Francesco). The TensorFlow version of this model was added by [amyeroberts](https://huggingface.co/amyeroberts). The original code can be found [here](https://github.com/KaimingHe/deep-residual-networks). - -## ResNetConfig - -[[autodoc]] ResNetConfig - - -## ResNetModel - -[[autodoc]] ResNetModel - - forward - - -## ResNetForImageClassification - -[[autodoc]] ResNetForImageClassification - - forward - - -## TFResNetModel - -[[autodoc]] TFResNetModel - - call - - -## TFResNetForImageClassification - -[[autodoc]] TFResNetForImageClassification - - call diff --git a/docs/source/en/model_doc/retribert.md b/docs/source/en/model_doc/retribert.md new file mode 100644 index 000000000000..ab29ac966fe1 --- /dev/null +++ b/docs/source/en/model_doc/retribert.md @@ -0,0 +1,53 @@ + + +# RetriBERT + + + +This model is in maintenance mode only, so we won't accept any new PRs changing its code. + +If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0. +You can do so by running the following command: `pip install -U transformers==4.30.0`. + + + +## Overview + +The RetriBERT model was proposed in the blog post [Explain Anything Like I'm Five: A Model for Open Domain Long Form +Question Answering](https://yjernite.github.io/lfqa.html). 
RetriBERT is a small model that uses either a single or +pair of BERT encoders with lower-dimension projection for dense semantic indexing of text. + +This model was contributed by [yjernite](https://huggingface.co/yjernite). Code to train and use the model can be +found [here](https://github.com/huggingface/transformers/tree/main/examples/research-projects/distillation). + + +## RetriBertConfig + +[[autodoc]] RetriBertConfig + +## RetriBertTokenizer + +[[autodoc]] RetriBertTokenizer + +## RetriBertTokenizerFast + +[[autodoc]] RetriBertTokenizerFast + +## RetriBertModel + +[[autodoc]] RetriBertModel + - forward diff --git a/docs/source/en/model_doc/retribert.mdx b/docs/source/en/model_doc/retribert.mdx deleted file mode 100644 index e83fae32300d..000000000000 --- a/docs/source/en/model_doc/retribert.mdx +++ /dev/null @@ -1,40 +0,0 @@ - - -# RetriBERT - -## Overview - -The RetriBERT model was proposed in the blog post [Explain Anything Like I'm Five: A Model for Open Domain Long Form -Question Answering](https://yjernite.github.io/lfqa.html). RetriBERT is a small model that uses either a single or -pair of BERT encoders with lower-dimension projection for dense semantic indexing of text. - -This model was contributed by [yjernite](https://huggingface.co/yjernite). Code to train and use the model can be -found [here](https://github.com/huggingface/transformers/tree/main/examples/research-projects/distillation). - - -## RetriBertConfig - -[[autodoc]] RetriBertConfig - -## RetriBertTokenizer - -[[autodoc]] RetriBertTokenizer - -## RetriBertTokenizerFast - -[[autodoc]] RetriBertTokenizerFast - -## RetriBertModel - -[[autodoc]] RetriBertModel - - forward diff --git a/docs/source/en/model_doc/roberta-prelayernorm.md b/docs/source/en/model_doc/roberta-prelayernorm.md new file mode 100644 index 000000000000..9822fd7af961 --- /dev/null +++ b/docs/source/en/model_doc/roberta-prelayernorm.md @@ -0,0 +1,152 @@ + + +# RoBERTa-PreLayerNorm + +## Overview + +The RoBERTa-PreLayerNorm model was proposed in [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. +It is identical to using the `--encoder-normalize-before` flag in [fairseq](https://fairseq.readthedocs.io/). + +The abstract from the paper is the following: + +*fairseq is an open-source sequence modeling toolkit that allows researchers and developers to train custom models for translation, summarization, language modeling, and other text generation tasks. The toolkit is based on PyTorch and supports distributed training across multiple GPUs and machines. We also support fast mixed-precision training and inference on modern GPUs.* + +Tips: + +- The implementation is the same as [Roberta](roberta) except instead of using _Add and Norm_ it does _Norm and Add_. _Add_ and _Norm_ refers to the Addition and LayerNormalization as described in [Attention Is All You Need](https://arxiv.org/abs/1706.03762). +- This is identical to using the `--encoder-normalize-before` flag in [fairseq](https://fairseq.readthedocs.io/). + +This model was contributed by [andreasmaden](https://huggingface.co/andreasmaden). +The original code can be found [here](https://github.com/princeton-nlp/DinkyTrain). 
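
Below is a minimal masked-language-modeling sketch. It is illustrative only; `andreasmadsen/efficient_mlm_m0.40` is used here as an example of a RoBERTa-PreLayerNorm checkpoint on the Hub, and any compatible checkpoint can be loaded the same way.

```python
from transformers import AutoTokenizer, RobertaPreLayerNormForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("andreasmadsen/efficient_mlm_m0.40")
model = RobertaPreLayerNormForMaskedLM.from_pretrained("andreasmadsen/efficient_mlm_m0.40")

inputs = tokenizer("The capital of France is <mask>.", return_tensors="pt")
logits = model(**inputs).logits

# Pick the most likely token for the masked position.
mask_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
predicted_token_id = logits[0, mask_index].argmax(dim=-1)
print(tokenizer.decode(predicted_token_id))
```
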
+ +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Token classification task guide](../tasks/token_classification) +- [Question answering task guide](../tasks/question_answering) +- [Causal language modeling task guide](../tasks/language_modeling) +- [Masked language modeling task guide](../tasks/masked_language_modeling) +- [Multiple choice task guide](../tasks/multiple_choice) + +## RobertaPreLayerNormConfig + +[[autodoc]] RobertaPreLayerNormConfig + +## RobertaPreLayerNormModel + +[[autodoc]] RobertaPreLayerNormModel + - forward + +## RobertaPreLayerNormForCausalLM + +[[autodoc]] RobertaPreLayerNormForCausalLM + - forward + +## RobertaPreLayerNormForMaskedLM + +[[autodoc]] RobertaPreLayerNormForMaskedLM + - forward + +## RobertaPreLayerNormForSequenceClassification + +[[autodoc]] RobertaPreLayerNormForSequenceClassification + - forward + +## RobertaPreLayerNormForMultipleChoice + +[[autodoc]] RobertaPreLayerNormForMultipleChoice + - forward + +## RobertaPreLayerNormForTokenClassification + +[[autodoc]] RobertaPreLayerNormForTokenClassification + - forward + +## RobertaPreLayerNormForQuestionAnswering + +[[autodoc]] RobertaPreLayerNormForQuestionAnswering + - forward + +## TFRobertaPreLayerNormModel + +[[autodoc]] TFRobertaPreLayerNormModel + - call + +## TFRobertaPreLayerNormForCausalLM + +[[autodoc]] TFRobertaPreLayerNormForCausalLM + - call + +## TFRobertaPreLayerNormForMaskedLM + +[[autodoc]] TFRobertaPreLayerNormForMaskedLM + - call + +## TFRobertaPreLayerNormForSequenceClassification + +[[autodoc]] TFRobertaPreLayerNormForSequenceClassification + - call + +## TFRobertaPreLayerNormForMultipleChoice + +[[autodoc]] TFRobertaPreLayerNormForMultipleChoice + - call + +## TFRobertaPreLayerNormForTokenClassification + +[[autodoc]] TFRobertaPreLayerNormForTokenClassification + - call + +## TFRobertaPreLayerNormForQuestionAnswering + +[[autodoc]] TFRobertaPreLayerNormForQuestionAnswering + - call + +## FlaxRobertaPreLayerNormModel + +[[autodoc]] FlaxRobertaPreLayerNormModel + - __call__ + +## FlaxRobertaPreLayerNormForCausalLM + +[[autodoc]] FlaxRobertaPreLayerNormForCausalLM + - __call__ + +## FlaxRobertaPreLayerNormForMaskedLM + +[[autodoc]] FlaxRobertaPreLayerNormForMaskedLM + - __call__ + +## FlaxRobertaPreLayerNormForSequenceClassification + +[[autodoc]] FlaxRobertaPreLayerNormForSequenceClassification + - __call__ + +## FlaxRobertaPreLayerNormForMultipleChoice + +[[autodoc]] FlaxRobertaPreLayerNormForMultipleChoice + - __call__ + +## FlaxRobertaPreLayerNormForTokenClassification + +[[autodoc]] FlaxRobertaPreLayerNormForTokenClassification + - __call__ + +## FlaxRobertaPreLayerNormForQuestionAnswering + +[[autodoc]] FlaxRobertaPreLayerNormForQuestionAnswering + - __call__ diff --git a/docs/source/en/model_doc/roberta-prelayernorm.mdx b/docs/source/en/model_doc/roberta-prelayernorm.mdx deleted file mode 100644 index a8fb2bb2b9aa..000000000000 --- a/docs/source/en/model_doc/roberta-prelayernorm.mdx +++ /dev/null @@ -1,140 +0,0 @@ - - -# RoBERTa-PreLayerNorm - -## Overview - -The RoBERTa-PreLayerNorm model was proposed in [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. -It is identical to using the `--encoder-normalize-before` flag in [fairseq](https://fairseq.readthedocs.io/). 
- -The abstract from the paper is the following: - -*fairseq is an open-source sequence modeling toolkit that allows researchers and developers to train custom models for translation, summarization, language modeling, and other text generation tasks. The toolkit is based on PyTorch and supports distributed training across multiple GPUs and machines. We also support fast mixed-precision training and inference on modern GPUs.* - -Tips: - -- The implementation is the same as [Roberta](roberta) except instead of using _Add and Norm_ it does _Norm and Add_. _Add_ and _Norm_ refers to the Addition and LayerNormalization as described in [Attention Is All You Need](https://arxiv.org/abs/1706.03762). -- This is identical to using the `--encoder-normalize-before` flag in [fairseq](https://fairseq.readthedocs.io/). - -This model was contributed by [andreasmaden](https://huggingface.co/andreasmaden). -The original code can be found [here](https://github.com/princeton-nlp/DinkyTrain). - - -## RobertaPreLayerNormConfig - -[[autodoc]] RobertaPreLayerNormConfig - -## RobertaPreLayerNormModel - -[[autodoc]] RobertaPreLayerNormModel - - forward - -## RobertaPreLayerNormForCausalLM - -[[autodoc]] RobertaPreLayerNormForCausalLM - - forward - -## RobertaPreLayerNormForMaskedLM - -[[autodoc]] RobertaPreLayerNormForMaskedLM - - forward - -## RobertaPreLayerNormForSequenceClassification - -[[autodoc]] RobertaPreLayerNormForSequenceClassification - - forward - -## RobertaPreLayerNormForMultipleChoice - -[[autodoc]] RobertaPreLayerNormForMultipleChoice - - forward - -## RobertaPreLayerNormForTokenClassification - -[[autodoc]] RobertaPreLayerNormForTokenClassification - - forward - -## RobertaPreLayerNormForQuestionAnswering - -[[autodoc]] RobertaPreLayerNormForQuestionAnswering - - forward - -## TFRobertaPreLayerNormModel - -[[autodoc]] TFRobertaPreLayerNormModel - - call - -## TFRobertaPreLayerNormForCausalLM - -[[autodoc]] TFRobertaPreLayerNormForCausalLM - - call - -## TFRobertaPreLayerNormForMaskedLM - -[[autodoc]] TFRobertaPreLayerNormForMaskedLM - - call - -## TFRobertaPreLayerNormForSequenceClassification - -[[autodoc]] TFRobertaPreLayerNormForSequenceClassification - - call - -## TFRobertaPreLayerNormForMultipleChoice - -[[autodoc]] TFRobertaPreLayerNormForMultipleChoice - - call - -## TFRobertaPreLayerNormForTokenClassification - -[[autodoc]] TFRobertaPreLayerNormForTokenClassification - - call - -## TFRobertaPreLayerNormForQuestionAnswering - -[[autodoc]] TFRobertaPreLayerNormForQuestionAnswering - - call - -## FlaxRobertaPreLayerNormModel - -[[autodoc]] FlaxRobertaPreLayerNormModel - - __call__ - -## FlaxRobertaPreLayerNormForCausalLM - -[[autodoc]] FlaxRobertaPreLayerNormForCausalLM - - __call__ - -## FlaxRobertaPreLayerNormForMaskedLM - -[[autodoc]] FlaxRobertaPreLayerNormForMaskedLM - - __call__ - -## FlaxRobertaPreLayerNormForSequenceClassification - -[[autodoc]] FlaxRobertaPreLayerNormForSequenceClassification - - __call__ - -## FlaxRobertaPreLayerNormForMultipleChoice - -[[autodoc]] FlaxRobertaPreLayerNormForMultipleChoice - - __call__ - -## FlaxRobertaPreLayerNormForTokenClassification - -[[autodoc]] FlaxRobertaPreLayerNormForTokenClassification - - __call__ - -## FlaxRobertaPreLayerNormForQuestionAnswering - -[[autodoc]] FlaxRobertaPreLayerNormForQuestionAnswering - - __call__ diff --git a/docs/source/en/model_doc/roberta.md b/docs/source/en/model_doc/roberta.md new file mode 100644 index 000000000000..5a2ba6b5cf66 --- /dev/null +++ b/docs/source/en/model_doc/roberta.md @@ -0,0 +1,233 @@ + + +# 
RoBERTa + +
+ +Models + + +Spaces + + +Paper page + +
+
+## Overview
+
+The RoBERTa model was proposed in [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, [Myle Ott](https://huggingface.co/myleott), Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer
+Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google's BERT model released in 2018.
+
+It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining objective and training with
+much larger mini-batches and learning rates.
+
+The abstract from the paper is the following:
+
+*Language model pretraining has led to significant performance gains but careful comparison between different
+approaches is challenging. Training is computationally expensive, often done on private datasets of different sizes,
+and, as we will show, hyperparameter choices have significant impact on the final results. We present a replication
+study of BERT pretraining (Devlin et al., 2019) that carefully measures the impact of many key hyperparameters and
+training data size. We find that BERT was significantly undertrained, and can match or exceed the performance of every
+model published after it. Our best model achieves state-of-the-art results on GLUE, RACE and SQuAD. These results
+highlight the importance of previously overlooked design choices, and raise questions about the source of recently
+reported improvements. We release our models and code.*
+
+Tips:
+
+- This implementation is the same as [`BertModel`] with a tiny embeddings tweak as well as a setup
+  for Roberta pretrained models.
+- RoBERTa has the same architecture as BERT, but uses a byte-level BPE as a tokenizer (same as GPT-2) and uses a
+  different pretraining scheme.
+- RoBERTa doesn't have `token_type_ids`, so you don't need to indicate which token belongs to which segment. Just
+  separate your segments with the separation token `tokenizer.sep_token` (or `</s>`).
+- Same as BERT with better pretraining tricks:
+
+    * dynamic masking: tokens are masked differently at each epoch, whereas BERT does it once and for all
+    * sentences are packed together to reach 512 tokens (so a single sample may span several documents)
+    * train with larger batches
+    * use BPE with bytes as a subunit and not characters (because of unicode characters)
+- [CamemBERT](camembert) is a wrapper around RoBERTa. Refer to this page for usage examples.
+
+This model was contributed by [julien-c](https://huggingface.co/julien-c). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/roberta).
+
+## Resources
+
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with RoBERTa. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+
+
+
+- A blog on [Getting Started with Sentiment Analysis on Twitter](https://huggingface.co/blog/sentiment-analysis-twitter) using RoBERTa and the [Inference API](https://huggingface.co/inference-api).
+- A blog on [Opinion Classification with Kili and Hugging Face AutoTrain](https://huggingface.co/blog/opinion-classification-with-kili) using RoBERTa.
+- A notebook on how to [finetune RoBERTa for sentiment analysis](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb).
🌎 +- [`RobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb). +- [`TFRobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb). +- [`FlaxRobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_flax.ipynb). +- [Text classification task guide](../tasks/sequence_classification) + + + +- [`RobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb). +- [`TFRobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/token-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb). +- [`FlaxRobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/token-classification). +- [Token classification](https://huggingface.co/course/chapter7/2?fw=pt) chapter of the 🤗 Hugging Face Course. +- [Token classification task guide](../tasks/token_classification) + + + +- A blog on [How to train a new language model from scratch using Transformers and Tokenizers](https://huggingface.co/blog/how-to-train) with RoBERTa. +- [`RobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#robertabertdistilbert-and-masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb). +- [`TFRobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling#run_mlmpy) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb). +- [`FlaxRobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/masked_language_modeling_flax.ipynb). +- [Masked language modeling](https://huggingface.co/course/chapter7/3?fw=pt) chapter of the 🤗 Hugging Face Course. +- [Masked language modeling task guide](../tasks/masked_language_modeling) + + + +- A blog on [Accelerated Inference with Optimum and Transformers Pipelines](https://huggingface.co/blog/optimum-inference) with RoBERTa for question answering. 
+- [`RobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb). +- [`TFRobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb). +- [`FlaxRobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/question-answering). +- [Question answering](https://huggingface.co/course/chapter7/7?fw=pt) chapter of the 🤗 Hugging Face Course. +- [Question answering task guide](../tasks/question_answering) + +**Multiple choice** +- [`RobertaForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb). +- [`TFRobertaForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb). +- [Multiple choice task guide](../tasks/multiple_choice) + +## RobertaConfig + +[[autodoc]] RobertaConfig + +## RobertaTokenizer + +[[autodoc]] RobertaTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## RobertaTokenizerFast + +[[autodoc]] RobertaTokenizerFast + - build_inputs_with_special_tokens + +## RobertaModel + +[[autodoc]] RobertaModel + - forward + +## RobertaForCausalLM + +[[autodoc]] RobertaForCausalLM + - forward + +## RobertaForMaskedLM + +[[autodoc]] RobertaForMaskedLM + - forward + +## RobertaForSequenceClassification + +[[autodoc]] RobertaForSequenceClassification + - forward + +## RobertaForMultipleChoice + +[[autodoc]] RobertaForMultipleChoice + - forward + +## RobertaForTokenClassification + +[[autodoc]] RobertaForTokenClassification + - forward + +## RobertaForQuestionAnswering + +[[autodoc]] RobertaForQuestionAnswering + - forward + +## TFRobertaModel + +[[autodoc]] TFRobertaModel + - call + +## TFRobertaForCausalLM + +[[autodoc]] TFRobertaForCausalLM + - call + +## TFRobertaForMaskedLM + +[[autodoc]] TFRobertaForMaskedLM + - call + +## TFRobertaForSequenceClassification + +[[autodoc]] TFRobertaForSequenceClassification + - call + +## TFRobertaForMultipleChoice + +[[autodoc]] TFRobertaForMultipleChoice + - call + +## TFRobertaForTokenClassification + +[[autodoc]] TFRobertaForTokenClassification + - call + +## TFRobertaForQuestionAnswering + +[[autodoc]] TFRobertaForQuestionAnswering + - call + +## FlaxRobertaModel + +[[autodoc]] FlaxRobertaModel + - __call__ + +## FlaxRobertaForCausalLM + +[[autodoc]] FlaxRobertaForCausalLM + - __call__ + +## FlaxRobertaForMaskedLM + +[[autodoc]] FlaxRobertaForMaskedLM + - __call__ + +## FlaxRobertaForSequenceClassification + +[[autodoc]] FlaxRobertaForSequenceClassification + - __call__ + +## FlaxRobertaForMultipleChoice + +[[autodoc]] FlaxRobertaForMultipleChoice + - __call__ + +## FlaxRobertaForTokenClassification + +[[autodoc]] FlaxRobertaForTokenClassification 
+ - __call__ + +## FlaxRobertaForQuestionAnswering + +[[autodoc]] FlaxRobertaForQuestionAnswering + - __call__ diff --git a/docs/source/en/model_doc/roberta.mdx b/docs/source/en/model_doc/roberta.mdx deleted file mode 100644 index 61f44381b0c4..000000000000 --- a/docs/source/en/model_doc/roberta.mdx +++ /dev/null @@ -1,206 +0,0 @@ - - -# RoBERTa - -## Overview - -The RoBERTa model was proposed in [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer -Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google's BERT model released in 2018. - -It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining objective and training with -much larger mini-batches and learning rates. - -The abstract from the paper is the following: - -*Language model pretraining has led to significant performance gains but careful comparison between different -approaches is challenging. Training is computationally expensive, often done on private datasets of different sizes, -and, as we will show, hyperparameter choices have significant impact on the final results. We present a replication -study of BERT pretraining (Devlin et al., 2019) that carefully measures the impact of many key hyperparameters and -training data size. We find that BERT was significantly undertrained, and can match or exceed the performance of every -model published after it. Our best model achieves state-of-the-art results on GLUE, RACE and SQuAD. These results -highlight the importance of previously overlooked design choices, and raise questions about the source of recently -reported improvements. We release our models and code.* - -Tips: - -- This implementation is the same as [`BertModel`] with a tiny embeddings tweak as well as a setup - for Roberta pretrained models. -- RoBERTa has the same architecture as BERT, but uses a byte-level BPE as a tokenizer (same as GPT-2) and uses a - different pretraining scheme. -- RoBERTa doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just - separate your segments with the separation token `tokenizer.sep_token` (or ``) -- [CamemBERT](camembert) is a wrapper around RoBERTa. Refer to this page for usage examples. - -This model was contributed by [julien-c](https://huggingface.co/julien-c). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/roberta). - -## Resources - -A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with RoBERTa. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. - - - -- A blog on [Getting Started with Sentiment Analysis on Twitter](https://huggingface.co/blog/sentiment-analysis-twitter) using RoBERTa and the [Inference API](https://huggingface.co/inference-api). -- A blog on [Opinion Classification with Kili and Hugging Face AutoTrain](https://huggingface.co/blog/opinion-classification-with-kili) using RoBERTa. -- A notebook on how to [finetune RoBERTa for sentiment analysis](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb). 
🌎 -- [`RobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb). -- [`TFRobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb). -- [`FlaxRobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_flax.ipynb). - - - -- [`RobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb). -- [`TFRobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/token-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb). -- [`FlaxRobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/token-classification). -- [Token classification](https://huggingface.co/course/chapter7/2?fw=pt) chapter of the 🤗 Hugging Face Course. - - - -- A blog on [How to train a new language model from scratch using Transformers and Tokenizers](https://huggingface.co/blog/how-to-train) with RoBERTa. -- [`RobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#robertabertdistilbert-and-masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb). -- [`TFRobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling#run_mlmpy) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb). -- [`FlaxRobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/masked_language_modeling_flax.ipynb). -- [Masked language modeling](https://huggingface.co/course/chapter7/3?fw=pt) chapter of the 🤗 Hugging Face Course. - - - -- A blog on [Accelerated Inference with Optimum and Transformers Pipelines](https://huggingface.co/blog/optimum-inference) with RoBERTa for question answering. -- [`RobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb). 
-- [`TFRobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb). -- [`FlaxRobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/question-answering). -- [Question answering](https://huggingface.co/course/chapter7/7?fw=pt) chapter of the 🤗 Hugging Face Course. - -**Multiple choice** -- [`RobertaForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb). -- [`TFRobertaForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb). - -## RobertaConfig - -[[autodoc]] RobertaConfig - -## RobertaTokenizer - -[[autodoc]] RobertaTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## RobertaTokenizerFast - -[[autodoc]] RobertaTokenizerFast - - build_inputs_with_special_tokens - -## RobertaModel - -[[autodoc]] RobertaModel - - forward - -## RobertaForCausalLM - -[[autodoc]] RobertaForCausalLM - - forward - -## RobertaForMaskedLM - -[[autodoc]] RobertaForMaskedLM - - forward - -## RobertaForSequenceClassification - -[[autodoc]] RobertaForSequenceClassification - - forward - -## RobertaForMultipleChoice - -[[autodoc]] RobertaForMultipleChoice - - forward - -## RobertaForTokenClassification - -[[autodoc]] RobertaForTokenClassification - - forward - -## RobertaForQuestionAnswering - -[[autodoc]] RobertaForQuestionAnswering - - forward - -## TFRobertaModel - -[[autodoc]] TFRobertaModel - - call - -## TFRobertaForCausalLM - -[[autodoc]] TFRobertaForCausalLM - - call - -## TFRobertaForMaskedLM - -[[autodoc]] TFRobertaForMaskedLM - - call - -## TFRobertaForSequenceClassification - -[[autodoc]] TFRobertaForSequenceClassification - - call - -## TFRobertaForMultipleChoice - -[[autodoc]] TFRobertaForMultipleChoice - - call - -## TFRobertaForTokenClassification - -[[autodoc]] TFRobertaForTokenClassification - - call - -## TFRobertaForQuestionAnswering - -[[autodoc]] TFRobertaForQuestionAnswering - - call - -## FlaxRobertaModel - -[[autodoc]] FlaxRobertaModel - - __call__ - -## FlaxRobertaForCausalLM - -[[autodoc]] FlaxRobertaForCausalLM - - __call__ - -## FlaxRobertaForMaskedLM - -[[autodoc]] FlaxRobertaForMaskedLM - - __call__ - -## FlaxRobertaForSequenceClassification - -[[autodoc]] FlaxRobertaForSequenceClassification - - __call__ - -## FlaxRobertaForMultipleChoice - -[[autodoc]] FlaxRobertaForMultipleChoice - - __call__ - -## FlaxRobertaForTokenClassification - -[[autodoc]] FlaxRobertaForTokenClassification - - __call__ - -## FlaxRobertaForQuestionAnswering - -[[autodoc]] FlaxRobertaForQuestionAnswering - - __call__ diff --git a/docs/source/en/model_doc/roc_bert.md b/docs/source/en/model_doc/roc_bert.md new file mode 100644 index 000000000000..831c656fb817 --- /dev/null +++ b/docs/source/en/model_doc/roc_bert.md @@ -0,0 +1,106 @@ + + +# RoCBert + +## Overview + +The RoCBert model was proposed in [RoCBert: 
Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. +It's a pretrained Chinese language model that is robust under various forms of adversarial attacks. + +The abstract from the paper is the following: + +*Large-scale pretrained language models have achieved SOTA results on NLP tasks. However, they have been shown +vulnerable to adversarial attacks especially for logographic languages like Chinese. In this work, we propose +ROCBERT: a pretrained Chinese Bert that is robust to various forms of adversarial attacks like word perturbation, +synonyms, typos, etc. It is pretrained with the contrastive learning objective which maximizes the label consistency +under different synthesized adversarial examples. The model takes as input multimodal information including the +semantic, phonetic and visual features. We show all these features are important to the model robustness since the +attack can be performed in all the three forms. Across 5 Chinese NLU tasks, ROCBERT outperforms strong baselines under +three blackbox adversarial algorithms without sacrificing the performance on clean testset. It also performs the best +in the toxic content detection task under human-made attacks.* + +This model was contributed by [weiweishi](https://huggingface.co/weiweishi). + +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Token classification task guide](../tasks/token_classification) +- [Question answering task guide](../tasks/question_answering) +- [Causal language modeling task guide](../tasks/language_modeling) +- [Masked language modeling task guide](../tasks/masked_language_modeling) +- [Multiple choice task guide](../tasks/multiple_choice) + +## RoCBertConfig + +[[autodoc]] RoCBertConfig + - all + + +## RoCBertTokenizer + +[[autodoc]] RoCBertTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + + +## RoCBertModel + +[[autodoc]] RoCBertModel + - forward + + +## RoCBertForPreTraining + +[[autodoc]] RoCBertForPreTraining + - forward + + +## RoCBertForCausalLM + +[[autodoc]] RoCBertForCausalLM + - forward + + +## RoCBertForMaskedLM + +[[autodoc]] RoCBertForMaskedLM + - forward + + +## RoCBertForSequenceClassification + +[[autodoc]] transformers.RoCBertForSequenceClassification + - forward + +## RoCBertForMultipleChoice + +[[autodoc]] transformers.RoCBertForMultipleChoice + - forward + + +## RoCBertForTokenClassification + +[[autodoc]] transformers.RoCBertForTokenClassification + - forward + + +## RoCBertForQuestionAnswering + +[[autodoc]] RoCBertForQuestionAnswering + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/roc_bert.mdx b/docs/source/en/model_doc/roc_bert.mdx deleted file mode 100644 index c30ccfd1c523..000000000000 --- a/docs/source/en/model_doc/roc_bert.mdx +++ /dev/null @@ -1,93 +0,0 @@ - - -# RoCBert - -## Overview - -The RoCBert model was proposed in [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. -It's a pretrained Chinese language model that is robust under various forms of adversarial attacks. - -The abstract from the paper is the following: - -*Large-scale pretrained language models have achieved SOTA results on NLP tasks. 
However, they have been shown -vulnerable to adversarial attacks especially for logographic languages like Chinese. In this work, we propose -ROCBERT: a pretrained Chinese Bert that is robust to various forms of adversarial attacks like word perturbation, -synonyms, typos, etc. It is pretrained with the contrastive learning objective which maximizes the label consistency -under different synthesized adversarial examples. The model takes as input multimodal information including the -semantic, phonetic and visual features. We show all these features are important to the model robustness since the -attack can be performed in all the three forms. Across 5 Chinese NLU tasks, ROCBERT outperforms strong baselines under -three blackbox adversarial algorithms without sacrificing the performance on clean testset. It also performs the best -in the toxic content detection task under human-made attacks.* - -This model was contributed by [weiweishi](https://huggingface.co/weiweishi). - -## RoCBertConfig - -[[autodoc]] RoCBertConfig - - all - - -## RoCBertTokenizer - -[[autodoc]] RoCBertTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - - -## RoCBertModel - -[[autodoc]] RoCBertModel - - forward - - -## RoCBertForPreTraining - -[[autodoc]] RoCBertForPreTraining - - forward - - -## RoCBertForCausalLM - -[[autodoc]] RoCBertForCausalLM - - forward - - -## RoCBertForMaskedLM - -[[autodoc]] RoCBertForMaskedLM - - forward - - -## RoCBertForSequenceClassification - -[[autodoc]] transformers.RoCBertForSequenceClassification - - forward - -## RoCBertForMultipleChoice - -[[autodoc]] transformers.RoCBertForMultipleChoice - - forward - - -## RoCBertForTokenClassification - -[[autodoc]] transformers.RoCBertForTokenClassification - - forward - - -## RoCBertForQuestionAnswering - -[[autodoc]] RoCBertForQuestionAnswering - - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/roformer.md b/docs/source/en/model_doc/roformer.md new file mode 100644 index 000000000000..f15a1062965f --- /dev/null +++ b/docs/source/en/model_doc/roformer.md @@ -0,0 +1,168 @@ + + +# RoFormer + +## Overview + +The RoFormer model was proposed in [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. + +The abstract from the paper is the following: + +*Position encoding in transformer architecture provides supervision for dependency modeling between elements at +different positions in the sequence. We investigate various methods to encode positional information in +transformer-based language models and propose a novel implementation named Rotary Position Embedding(RoPE). The +proposed RoPE encodes absolute positional information with rotation matrix and naturally incorporates explicit relative +position dependency in self-attention formulation. Notably, RoPE comes with valuable properties such as flexibility of +being expand to any sequence lengths, decaying inter-token dependency with increasing relative distances, and +capability of equipping the linear self-attention with relative position encoding. As a result, the enhanced +transformer with rotary position embedding, or RoFormer, achieves superior performance in tasks with long texts. We +release the theoretical analysis along with some preliminary experiment results on Chinese data. 
The undergoing +experiment for English benchmark will soon be updated.* + +Tips: + +- RoFormer is a BERT-like autoencoding model with rotary position embeddings. Rotary position embeddings have shown + improved performance on classification tasks with long texts. + + +This model was contributed by [junnyu](https://huggingface.co/junnyu). The original code can be found [here](https://github.com/ZhuiyiTechnology/roformer). + +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Token classification task guide](../tasks/token_classification) +- [Question answering task guide](../tasks/question_answering) +- [Causal language modeling task guide](../tasks/language_modeling) +- [Masked language modeling task guide](../tasks/masked_language_modeling) +- [Multiple choice task guide](../tasks/multiple_choice) + +## RoFormerConfig + +[[autodoc]] RoFormerConfig + +## RoFormerTokenizer + +[[autodoc]] RoFormerTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## RoFormerTokenizerFast + +[[autodoc]] RoFormerTokenizerFast + - build_inputs_with_special_tokens + +## RoFormerModel + +[[autodoc]] RoFormerModel + - forward + +## RoFormerForCausalLM + +[[autodoc]] RoFormerForCausalLM + - forward + +## RoFormerForMaskedLM + +[[autodoc]] RoFormerForMaskedLM + - forward + +## RoFormerForSequenceClassification + +[[autodoc]] RoFormerForSequenceClassification + - forward + +## RoFormerForMultipleChoice + +[[autodoc]] RoFormerForMultipleChoice + - forward + +## RoFormerForTokenClassification + +[[autodoc]] RoFormerForTokenClassification + - forward + +## RoFormerForQuestionAnswering + +[[autodoc]] RoFormerForQuestionAnswering + - forward + +## TFRoFormerModel + +[[autodoc]] TFRoFormerModel + - call + +## TFRoFormerForMaskedLM + +[[autodoc]] TFRoFormerForMaskedLM + - call + +## TFRoFormerForCausalLM + +[[autodoc]] TFRoFormerForCausalLM + - call + +## TFRoFormerForSequenceClassification + +[[autodoc]] TFRoFormerForSequenceClassification + - call + +## TFRoFormerForMultipleChoice + +[[autodoc]] TFRoFormerForMultipleChoice + - call + +## TFRoFormerForTokenClassification + +[[autodoc]] TFRoFormerForTokenClassification + - call + +## TFRoFormerForQuestionAnswering + +[[autodoc]] TFRoFormerForQuestionAnswering + - call + +## FlaxRoFormerModel + +[[autodoc]] FlaxRoFormerModel + - __call__ + +## FlaxRoFormerForMaskedLM + +[[autodoc]] FlaxRoFormerForMaskedLM + - __call__ + +## FlaxRoFormerForSequenceClassification + +[[autodoc]] FlaxRoFormerForSequenceClassification + - __call__ + +## FlaxRoFormerForMultipleChoice + +[[autodoc]] FlaxRoFormerForMultipleChoice + - __call__ + +## FlaxRoFormerForTokenClassification + +[[autodoc]] FlaxRoFormerForTokenClassification + - __call__ + +## FlaxRoFormerForQuestionAnswering + +[[autodoc]] FlaxRoFormerForQuestionAnswering + - __call__ diff --git a/docs/source/en/model_doc/roformer.mdx b/docs/source/en/model_doc/roformer.mdx deleted file mode 100644 index 435941d9f29a..000000000000 --- a/docs/source/en/model_doc/roformer.mdx +++ /dev/null @@ -1,155 +0,0 @@ - - -# RoFormer - -## Overview - -The RoFormer model was proposed in [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. 
- -The abstract from the paper is the following: - -*Position encoding in transformer architecture provides supervision for dependency modeling between elements at -different positions in the sequence. We investigate various methods to encode positional information in -transformer-based language models and propose a novel implementation named Rotary Position Embedding(RoPE). The -proposed RoPE encodes absolute positional information with rotation matrix and naturally incorporates explicit relative -position dependency in self-attention formulation. Notably, RoPE comes with valuable properties such as flexibility of -being expand to any sequence lengths, decaying inter-token dependency with increasing relative distances, and -capability of equipping the linear self-attention with relative position encoding. As a result, the enhanced -transformer with rotary position embedding, or RoFormer, achieves superior performance in tasks with long texts. We -release the theoretical analysis along with some preliminary experiment results on Chinese data. The undergoing -experiment for English benchmark will soon be updated.* - -Tips: - -- RoFormer is a BERT-like autoencoding model with rotary position embeddings. Rotary position embeddings have shown - improved performance on classification tasks with long texts. - - -This model was contributed by [junnyu](https://huggingface.co/junnyu). The original code can be found [here](https://github.com/ZhuiyiTechnology/roformer). - -## RoFormerConfig - -[[autodoc]] RoFormerConfig - -## RoFormerTokenizer - -[[autodoc]] RoFormerTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## RoFormerTokenizerFast - -[[autodoc]] RoFormerTokenizerFast - - build_inputs_with_special_tokens - -## RoFormerModel - -[[autodoc]] RoFormerModel - - forward - -## RoFormerForCausalLM - -[[autodoc]] RoFormerForCausalLM - - forward - -## RoFormerForMaskedLM - -[[autodoc]] RoFormerForMaskedLM - - forward - -## RoFormerForSequenceClassification - -[[autodoc]] RoFormerForSequenceClassification - - forward - -## RoFormerForMultipleChoice - -[[autodoc]] RoFormerForMultipleChoice - - forward - -## RoFormerForTokenClassification - -[[autodoc]] RoFormerForTokenClassification - - forward - -## RoFormerForQuestionAnswering - -[[autodoc]] RoFormerForQuestionAnswering - - forward - -## TFRoFormerModel - -[[autodoc]] TFRoFormerModel - - call - -## TFRoFormerForMaskedLM - -[[autodoc]] TFRoFormerForMaskedLM - - call - -## TFRoFormerForCausalLM - -[[autodoc]] TFRoFormerForCausalLM - - call - -## TFRoFormerForSequenceClassification - -[[autodoc]] TFRoFormerForSequenceClassification - - call - -## TFRoFormerForMultipleChoice - -[[autodoc]] TFRoFormerForMultipleChoice - - call - -## TFRoFormerForTokenClassification - -[[autodoc]] TFRoFormerForTokenClassification - - call - -## TFRoFormerForQuestionAnswering - -[[autodoc]] TFRoFormerForQuestionAnswering - - call - -## FlaxRoFormerModel - -[[autodoc]] FlaxRoFormerModel - - __call__ - -## FlaxRoFormerForMaskedLM - -[[autodoc]] FlaxRoFormerForMaskedLM - - __call__ - -## FlaxRoFormerForSequenceClassification - -[[autodoc]] FlaxRoFormerForSequenceClassification - - __call__ - -## FlaxRoFormerForMultipleChoice - -[[autodoc]] FlaxRoFormerForMultipleChoice - - __call__ - -## FlaxRoFormerForTokenClassification - -[[autodoc]] FlaxRoFormerForTokenClassification - - __call__ - -## FlaxRoFormerForQuestionAnswering - -[[autodoc]] FlaxRoFormerForQuestionAnswering - - __call__ diff 
--git a/docs/source/en/model_doc/rwkv.md b/docs/source/en/model_doc/rwkv.md new file mode 100644 index 000000000000..9293db14cc63 --- /dev/null +++ b/docs/source/en/model_doc/rwkv.md @@ -0,0 +1,151 @@
+
+
+# RWKV
+
+## Overview
+
+The RWKV model was proposed in [this repo](https://github.com/BlinkDL/RWKV-LM).
+
+It suggests a tweak in the traditional Transformer attention to make it linear. This way, the model can be used as a recurrent network: passing inputs for timestamp 0 and timestamp 1 together is the same as passing inputs at timestamp 0, then inputs at timestamp 1 along with the state of timestamp 0 (see example below).
+
+This can be more efficient than a regular Transformer and can deal with sentences of any length (even if the model uses a fixed context length for training).
+
+This model was contributed by [sgugger](https://huggingface.co/sgugger).
+The original code can be found [here](https://github.com/BlinkDL/RWKV-LM).
+
+Example of use as an RNN:
+
+```py
+import torch
+from transformers import AutoTokenizer, RwkvConfig, RwkvModel
+
+model = RwkvModel.from_pretrained("sgugger/rwkv-430M-pile")
+tokenizer = AutoTokenizer.from_pretrained("sgugger/rwkv-430M-pile")
+
+inputs = tokenizer("This is an example.", return_tensors="pt")
+# Feed everything to the model
+outputs = model(inputs["input_ids"])
+output_whole = outputs.last_hidden_state
+
+outputs = model(inputs["input_ids"][:, :2])
+output_one = outputs.last_hidden_state
+
+# Using the state computed on the first inputs, we will get the same output
+outputs = model(inputs["input_ids"][:, 2:], state=outputs.state)
+output_two = outputs.last_hidden_state
+
+torch.allclose(torch.cat([output_one, output_two], dim=1), output_whole, atol=1e-5)
+```
+
+If you want to make sure the model stops generating when `'\n\n'` is detected, we recommend using the following stopping criteria:
+
+```python
+import torch
+from transformers import StoppingCriteria
+
+class RwkvStoppingCriteria(StoppingCriteria):
+    def __init__(self, eos_sequence = [187,187], eos_token_id = 537):
+        self.eos_sequence = eos_sequence
+        self.eos_token_id = eos_token_id
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+        # Stop once the last two generated ids form the eos sequence (the token ids for `'\n\n'`).
+        last_2_ids = input_ids[:,-2:].tolist()
+        return self.eos_sequence in last_2_ids
+
+
+output = model.generate(inputs["input_ids"], max_new_tokens=64, stopping_criteria = [RwkvStoppingCriteria()])
+```
+
+## RwkvConfig
+
+[[autodoc]] RwkvConfig
+
+
+## RwkvModel
+
+[[autodoc]] RwkvModel
+    - forward
+
+## RwkvLMHeadModel
+
+[[autodoc]] RwkvForCausalLM
+    - forward
+
+## Rwkv attention and the recurrent formulas
+
+In a traditional auto-regressive Transformer, attention is written as
+
+$$O = \hbox{softmax}(QK^{T} / \sqrt{d}) V$$
+
+where \\(Q\\), \\(K\\) and \\(V\\) are matrices of shape `seq_len x hidden_size` named query, key and value (they are actually bigger matrices with a batch dimension and an attention head dimension but we're only interested in the last two, which is where the matrix product is taken, so for the sake of simplicity we only consider those two). The product \\(QK^{T}\\) then has shape `seq_len x seq_len` and we can take the matrix product with \\(V\\) to get the output \\(O\\) of the same shape as the others.
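+
+For reference, this masked attention can be written directly in a few lines of PyTorch. This is only an illustrative sketch of the standard formulation above (not the fused kernel RWKV actually uses); the derivation below then rewrites the softmax explicitly.
+
+```python
+import math
+import torch
+
+def causal_attention(q, k, v):
+    """Plain masked attention O = softmax(QK^T / sqrt(d)) V for (seq_len, hidden_size) tensors."""
+    d = q.size(-1)
+    scores = q @ k.transpose(-2, -1) / math.sqrt(d)  # (seq_len, seq_len)
+    # Mask out positions j > i so a token cannot attend to future tokens.
+    future = torch.triu(torch.ones_like(scores, dtype=torch.bool), diagonal=1)
+    scores = scores.masked_fill(future, float("-inf"))
+    return torch.softmax(scores, dim=-1) @ v  # (seq_len, hidden_size)
+```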
+
+Replacing the softmax by its value gives:
+
+$$O_{i} = \frac{\sum_{j=1}^{i} e^{Q_{i} K_{j}^{T} / \sqrt{d}} V_{j}}{\sum_{j=1}^{i} e^{Q_{i} K_{j}^{T} / \sqrt{d}}}$$
+
+Note that the entries in \\(QK^{T}\\) corresponding to \\(j > i\\) are masked (the sum stops at \\(j = i\\)) because the attention is not allowed to look at future tokens (only past ones).
+
+In comparison, the RWKV attention is given by
+
+$$O_{i} = \sigma(R_{i}) \frac{\sum_{j=1}^{i} e^{W_{i-j} + K_{j}} V_{j}}{\sum_{j=1}^{i} e^{W_{i-j} + K_{j}}}$$
+
+where \\(R\\) is a new matrix called receptance by the author, and \\(K\\) and \\(V\\) are still the key and value (\\(\sigma\\) here is the sigmoid function). \\(W\\) is a new vector that represents the position of the token and is given by
+
+$$W_{0} = u \hbox{ and } W_{k} = (k-1)w \hbox{ for } k \geq 1$$
+
+with \\(u\\) and \\(w\\) learnable parameters called in the code `time_first` and `time_decay` respectively. The numerator and denominator can both be expressed recursively. Naming them \\(N_{i}\\) and \\(D_{i}\\), we have:
+
+$$N_{i} = e^{u + K_{i}} V_{i} + \hat{N}_{i} \hbox{ where } \hat{N}_{i} = e^{K_{i-1}} V_{i-1} + e^{w + K_{i-2}} V_{i-2} + \cdots + e^{(i-2)w + K_{1}} V_{1}$$
+
+so \\(\hat{N}_{i}\\) (called `numerator_state` in the code) satisfies
+
+$$\hat{N}_{0} = 0 \hbox{ and } \hat{N}_{j+1} = e^{K_{j}} V_{j} + e^{w} \hat{N}_{j}$$
+
+and
+
+$$D_{i} = e^{u + K_{i}} + \hat{D}_{i} \hbox{ where } \hat{D}_{i} = e^{K_{i-1}} + e^{w + K_{i-2}} + \cdots + e^{(i-2)w + K_{1}}$$
+
+so \\(\hat{D}_{i}\\) (called `denominator_state` in the code) satisfies
+
+$$\hat{D}_{0} = 0 \hbox{ and } \hat{D}_{j+1} = e^{K_{j}} + e^{w} \hat{D}_{j}$$
+
+The actual recurrent formulas used are a tiny bit more complex, as for numerical stability we don't want to compute exponentials of big numbers. Usually the softmax is not computed as is; instead, the numerator and denominator are both divided by the exponential of the maximum term:
+
+$$\frac{e^{x_{i}}}{\sum_{j=1}^{n} e^{x_{j}}} = \frac{e^{x_{i} - M}}{\sum_{j=1}^{n} e^{x_{j} - M}}$$
+
+with \\(M\\) the maximum of all \\(x_{j}\\). So here, on top of saving the numerator state (\\(\hat{N}\\)) and the denominator state (\\(\hat{D}\\)), we also keep track of the maximum of all terms encountered in the exponentials. So we actually use
+
+$$\tilde{N}_{i} = e^{-M_{i}} \hat{N}_{i} \hbox{ and } \tilde{D}_{i} = e^{-M_{i}} \hat{D}_{i}$$
+
+defined by the following recurrent formulas:
+
+$$\tilde{N}_{0} = 0 \hbox{ and } \tilde{N}_{j+1} = e^{K_{j} - q} V_{j} + e^{w + M_{j} - q} \tilde{N}_{j} \hbox{ where } q = \max(K_{j}, w + M_{j})$$
+
+and
+
+$$\tilde{D}_{0} = 0 \hbox{ and } \tilde{D}_{j+1} = e^{K_{j} - q} + e^{w + M_{j} - q} \tilde{D}_{j} \hbox{ where } q = \max(K_{j}, w + M_{j})$$
+
+and \\(M_{j+1} = q\\).
With those, we can then compute
+
+$$N_{i} = e^{u + K_{i} - q} V_{i} + e^{M_{i} - q} \tilde{N}_{i} \hbox{ where } q = \max(u + K_{i}, M_{i})$$
+
+and
+
+$$D_{i} = e^{u + K_{i} - q} + e^{M_{i} - q} \tilde{D}_{i} \hbox{ where } q = \max(u + K_{i}, M_{i})$$
+
+which finally gives us
+
+$$O_{i} = \sigma(R_{i}) \frac{N_{i}}{D_{i}}$$
\ No newline at end of file
diff --git a/docs/source/en/model_doc/sam.md b/docs/source/en/model_doc/sam.md new file mode 100644 index 000000000000..fe0d24623fd7 --- /dev/null +++ b/docs/source/en/model_doc/sam.md @@ -0,0 +1,111 @@
+
+
+# SAM
+
+## Overview
+
+SAM (Segment Anything Model) was proposed in [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
+
+The model can be used to predict segmentation masks of any object of interest given an input image.
+
+![example image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-output.png)
+
+The abstract from the paper is the following:
+
+*We introduce the Segment Anything (SA) project: a new task, model, and dataset for image segmentation. Using our efficient model in a data collection loop, we built the largest segmentation dataset to date (by far), with over 1 billion masks on 11M licensed and privacy respecting images. The model is designed and trained to be promptable, so it can transfer zero-shot to new image distributions and tasks. We evaluate its capabilities on numerous tasks and find that its zero-shot performance is impressive -- often competitive with or even superior to prior fully supervised results. We are releasing the Segment Anything Model (SAM) and corresponding dataset (SA-1B) of 1B masks and 11M images at [https://segment-anything.com](https://segment-anything.com) to foster research into foundation models for computer vision.*
+
+Tips:
+
+- The model predicts binary masks that state the presence or absence of the object of interest given an image.
+- The model predicts much better results if input 2D points and/or input bounding boxes are provided.
+- You can prompt multiple points for the same image, and predict a single mask.
+- Fine-tuning the model is not supported yet.
+- According to the paper, textual input should also be supported. However, at the time of writing this does not appear to be supported, according to [the official repository](https://github.com/facebookresearch/segment-anything/issues/4#issuecomment-1497626844).
+
+
+This model was contributed by [ybelkada](https://huggingface.co/ybelkada) and [ArthurZ](https://huggingface.co/ArthurZ).
+The original code can be found [here](https://github.com/facebookresearch/segment-anything).
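+
+In addition to the point-prompted usage shown below, whole-image automatic mask generation can be run through the `mask-generation` pipeline. The sketch below assumes a recent version of Transformers that ships this pipeline task and reuses the `facebook/sam-vit-huge` checkpoint from the example that follows.
+
+```python
+from transformers import pipeline
+
+# Automatic mask generation: SAM is prompted with a grid of points over the whole image.
+generator = pipeline("mask-generation", model="facebook/sam-vit-huge")
+
+img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
+outputs = generator(img_url, points_per_batch=64)
+
+print(len(outputs["masks"]), "masks generated")
+```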
+ +Below is an example on how to run mask generation given an image and a 2D point: + +```python +import torch +from PIL import Image +import requests +from transformers import SamModel, SamProcessor + +device = "cuda" if torch.cuda.is_available() else "cpu" +model = SamModel.from_pretrained("facebook/sam-vit-huge").to(device) +processor = SamProcessor.from_pretrained("facebook/sam-vit-huge") + +img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png" +raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB") +input_points = [[[450, 600]]] # 2D location of a window in the image + +inputs = processor(raw_image, input_points=input_points, return_tensors="pt").to(device) +outputs = model(**inputs) + +masks = processor.image_processor.post_process_masks( + outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu() +) +scores = outputs.iou_scores +``` + +Resources: + +- [Demo notebook](https://github.com/huggingface/notebooks/blob/main/examples/segment_anything.ipynb) for using the model. +- [Demo notebook](https://github.com/huggingface/notebooks/blob/main/examples/automatic_mask_generation.ipynb) for using the automatic mask generation pipeline. +- [Demo notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/SAM/Run_inference_with_MedSAM_using_HuggingFace_Transformers.ipynb) for inference with MedSAM, a fine-tuned version of SAM on the medical domain. +- [Demo notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/SAM/Fine_tune_SAM_(segment_anything)_on_a_custom_dataset.ipynb) for fine-tuning the model on custom data. + +## SamConfig + +[[autodoc]] SamConfig + +## SamVisionConfig + +[[autodoc]] SamVisionConfig + +## SamMaskDecoderConfig + +[[autodoc]] SamMaskDecoderConfig + +## SamPromptEncoderConfig + +[[autodoc]] SamPromptEncoderConfig + + +## SamProcessor + +[[autodoc]] SamProcessor + + +## SamImageProcessor + +[[autodoc]] SamImageProcessor + + +## SamModel + +[[autodoc]] SamModel + - forward + + +## TFSamModel + +[[autodoc]] TFSamModel + - call \ No newline at end of file diff --git a/docs/source/en/model_doc/segformer.md b/docs/source/en/model_doc/segformer.md new file mode 100644 index 000000000000..0f535351af5c --- /dev/null +++ b/docs/source/en/model_doc/segformer.md @@ -0,0 +1,164 @@ + + +# SegFormer + +## Overview + +The SegFormer model was proposed in [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping +Luo. The model consists of a hierarchical Transformer encoder and a lightweight all-MLP decode head to achieve great +results on image segmentation benchmarks such as ADE20K and Cityscapes. + +The abstract from the paper is the following: + +*We present SegFormer, a simple, efficient yet powerful semantic segmentation framework which unifies Transformers with +lightweight multilayer perception (MLP) decoders. SegFormer has two appealing features: 1) SegFormer comprises a novel +hierarchically structured Transformer encoder which outputs multiscale features. It does not need positional encoding, +thereby avoiding the interpolation of positional codes which leads to decreased performance when the testing resolution +differs from training. 2) SegFormer avoids complex decoders. 
The proposed MLP decoder aggregates information from +different layers, and thus combining both local attention and global attention to render powerful representations. We +show that this simple and lightweight design is the key to efficient segmentation on Transformers. We scale our +approach up to obtain a series of models from SegFormer-B0 to SegFormer-B5, reaching significantly better performance +and efficiency than previous counterparts. For example, SegFormer-B4 achieves 50.3% mIoU on ADE20K with 64M parameters, +being 5x smaller and 2.2% better than the previous best method. Our best model, SegFormer-B5, achieves 84.0% mIoU on +Cityscapes validation set and shows excellent zero-shot robustness on Cityscapes-C.* + +The figure below illustrates the architecture of SegFormer. Taken from the [original paper](https://arxiv.org/abs/2105.15203). + + + +This model was contributed by [nielsr](https://huggingface.co/nielsr). The TensorFlow version +of the model was contributed by [sayakpaul](https://huggingface.co/sayakpaul). The original code can be found [here](https://github.com/NVlabs/SegFormer). + +Tips: + +- SegFormer consists of a hierarchical Transformer encoder, and a lightweight all-MLP decoder head. + [`SegformerModel`] is the hierarchical Transformer encoder (which in the paper is also referred to + as Mix Transformer or MiT). [`SegformerForSemanticSegmentation`] adds the all-MLP decoder head on + top to perform semantic segmentation of images. In addition, there's + [`SegformerForImageClassification`] which can be used to - you guessed it - classify images. The + authors of SegFormer first pre-trained the Transformer encoder on ImageNet-1k to classify images. Next, they throw + away the classification head, and replace it by the all-MLP decode head. Next, they fine-tune the model altogether on + ADE20K, Cityscapes and COCO-stuff, which are important benchmarks for semantic segmentation. All checkpoints can be + found on the [hub](https://huggingface.co/models?other=segformer). +- The quickest way to get started with SegFormer is by checking the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SegFormer) (which showcase both inference and + fine-tuning on custom data). One can also check out the [blog post](https://huggingface.co/blog/fine-tune-segformer) introducing SegFormer and illustrating how it can be fine-tuned on custom data. +- TensorFlow users should refer to [this repository](https://github.com/deep-diver/segformer-tf-transformers) that shows off-the-shelf inference and fine-tuning. +- One can also check out [this interactive demo on Hugging Face Spaces](https://huggingface.co/spaces/chansung/segformer-tf-transformers) + to try out a SegFormer model on custom images. +- SegFormer works on any input size, as it pads the input to be divisible by `config.patch_sizes`. +- One can use [`SegformerImageProcessor`] to prepare images and corresponding segmentation maps + for the model. Note that this image processor is fairly basic and does not include all data augmentations used in + the original paper. The original preprocessing pipelines (for the ADE20k dataset for instance) can be found [here](https://github.com/NVlabs/SegFormer/blob/master/local_configs/_base_/datasets/ade20k_repeat.py). The most + important preprocessing step is that images and segmentation maps are randomly cropped and padded to the same size, + such as 512x512 or 640x640, after which they are normalized. 
+
+- One additional thing to keep in mind is that one can initialize [`SegformerImageProcessor`] with
+  `reduce_labels` set to `True` or `False`. In some datasets (like ADE20k), the 0 index is used in the annotated
+  segmentation maps for background. However, ADE20k doesn't include the "background" class in its 150 labels.
+  Therefore, `reduce_labels` is used to reduce all labels by 1, and to make sure no loss is computed for the
+  background class (i.e. it replaces 0 in the annotated maps by 255, which is the *ignore_index* of the loss function
+  used by [`SegformerForSemanticSegmentation`]). However, other datasets use the 0 index as
+  background class and include this class as part of all labels. In that case, `reduce_labels` should be set to
+  `False`, as loss should also be computed for the background class.
+- Like most models, SegFormer comes in different sizes, the details of which can be found in the table below
+  (taken from Table 7 of the [original paper](https://arxiv.org/abs/2105.15203)).
+
+| **Model variant** | **Depths** | **Hidden sizes** | **Decoder hidden size** | **Params (M)** | **ImageNet-1k Top 1** |
+| :---------------: | ------------- | ------------------- | :---------------------: | :------------: | :-------------------: |
+| MiT-b0 | [2, 2, 2, 2] | [32, 64, 160, 256] | 256 | 3.7 | 70.5 |
+| MiT-b1 | [2, 2, 2, 2] | [64, 128, 320, 512] | 256 | 14.0 | 78.7 |
+| MiT-b2 | [3, 4, 6, 3] | [64, 128, 320, 512] | 768 | 25.4 | 81.6 |
+| MiT-b3 | [3, 4, 18, 3] | [64, 128, 320, 512] | 768 | 45.2 | 83.1 |
+| MiT-b4 | [3, 8, 27, 3] | [64, 128, 320, 512] | 768 | 62.6 | 83.6 |
+| MiT-b5 | [3, 6, 40, 3] | [64, 128, 320, 512] | 768 | 82.0 | 83.8 |
+
+Note that MiT in the above table refers to the Mix Transformer encoder backbone introduced in SegFormer. For
+SegFormer's results on the segmentation datasets like ADE20k, refer to the [paper](https://arxiv.org/abs/2105.15203).
+
+## Resources
+
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SegFormer.
+
+
+- [`SegformerForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb).
+- [Image classification task guide](../tasks/image_classification)
+
+Semantic segmentation:
+
+- [`SegformerForSemanticSegmentation`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/semantic-segmentation).
+- A blog on fine-tuning SegFormer on a custom dataset can be found [here](https://huggingface.co/blog/fine-tune-segformer).
+- More demo notebooks on SegFormer (both inference + fine-tuning on a custom dataset) can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SegFormer).
+- [`TFSegformerForSemanticSegmentation`] is supported by this [example notebook](https://github.com/huggingface/notebooks/blob/main/examples/semantic_segmentation-tf.ipynb).
+- [Semantic segmentation task guide](../tasks/semantic_segmentation)
+
+If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
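+
+To tie the tips above together, below is a minimal semantic segmentation inference sketch. The checkpoint name and the test image URL are illustrative assumptions; any fine-tuned SegFormer segmentation checkpoint from the hub should work the same way:
+
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import SegformerForSemanticSegmentation, SegformerImageProcessor
+
+# assumed checkpoint (fine-tuned on ADE20k); substitute your own SegFormer checkpoint
+checkpoint = "nvidia/segformer-b0-finetuned-ade-512-512"
+image_processor = SegformerImageProcessor.from_pretrained(checkpoint)
+model = SegformerForSemanticSegmentation.from_pretrained(checkpoint)
+
+# any RGB image works; this URL is just an example
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+inputs = image_processor(images=image, return_tensors="pt")
+with torch.no_grad():
+    outputs = model(**inputs)
+
+# upsample the low-resolution logits back to the image size and take the per-pixel argmax
+segmentation = image_processor.post_process_semantic_segmentation(
+    outputs, target_sizes=[image.size[::-1]]
+)[0]
+print(segmentation.shape)  # (height, width) map of predicted class indices
+```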
+ +## SegformerConfig + +[[autodoc]] SegformerConfig + +## SegformerFeatureExtractor + +[[autodoc]] SegformerFeatureExtractor + - __call__ + - post_process_semantic_segmentation + +## SegformerImageProcessor + +[[autodoc]] SegformerImageProcessor + - preprocess + - post_process_semantic_segmentation + +## SegformerModel + +[[autodoc]] SegformerModel + - forward + +## SegformerDecodeHead + +[[autodoc]] SegformerDecodeHead + - forward + +## SegformerForImageClassification + +[[autodoc]] SegformerForImageClassification + - forward + +## SegformerForSemanticSegmentation + +[[autodoc]] SegformerForSemanticSegmentation + - forward + +## TFSegformerDecodeHead + +[[autodoc]] TFSegformerDecodeHead + - call + +## TFSegformerModel + +[[autodoc]] TFSegformerModel + - call + +## TFSegformerForImageClassification + +[[autodoc]] TFSegformerForImageClassification + - call + +## TFSegformerForSemanticSegmentation + +[[autodoc]] TFSegformerForSemanticSegmentation + - call diff --git a/docs/source/en/model_doc/segformer.mdx b/docs/source/en/model_doc/segformer.mdx deleted file mode 100644 index 76a02c27f423..000000000000 --- a/docs/source/en/model_doc/segformer.mdx +++ /dev/null @@ -1,142 +0,0 @@ - - -# SegFormer - -## Overview - -The SegFormer model was proposed in [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping -Luo. The model consists of a hierarchical Transformer encoder and a lightweight all-MLP decode head to achieve great -results on image segmentation benchmarks such as ADE20K and Cityscapes. - -The abstract from the paper is the following: - -*We present SegFormer, a simple, efficient yet powerful semantic segmentation framework which unifies Transformers with -lightweight multilayer perception (MLP) decoders. SegFormer has two appealing features: 1) SegFormer comprises a novel -hierarchically structured Transformer encoder which outputs multiscale features. It does not need positional encoding, -thereby avoiding the interpolation of positional codes which leads to decreased performance when the testing resolution -differs from training. 2) SegFormer avoids complex decoders. The proposed MLP decoder aggregates information from -different layers, and thus combining both local attention and global attention to render powerful representations. We -show that this simple and lightweight design is the key to efficient segmentation on Transformers. We scale our -approach up to obtain a series of models from SegFormer-B0 to SegFormer-B5, reaching significantly better performance -and efficiency than previous counterparts. For example, SegFormer-B4 achieves 50.3% mIoU on ADE20K with 64M parameters, -being 5x smaller and 2.2% better than the previous best method. Our best model, SegFormer-B5, achieves 84.0% mIoU on -Cityscapes validation set and shows excellent zero-shot robustness on Cityscapes-C.* - -The figure below illustrates the architecture of SegFormer. Taken from the [original paper](https://arxiv.org/abs/2105.15203). - - - -This model was contributed by [nielsr](https://huggingface.co/nielsr). The TensorFlow version -of the model was contributed by [sayakpaul](https://huggingface.co/sayakpaul). The original code can be found [here](https://github.com/NVlabs/SegFormer). - -Tips: - -- SegFormer consists of a hierarchical Transformer encoder, and a lightweight all-MLP decoder head. 
- [`SegformerModel`] is the hierarchical Transformer encoder (which in the paper is also referred to - as Mix Transformer or MiT). [`SegformerForSemanticSegmentation`] adds the all-MLP decoder head on - top to perform semantic segmentation of images. In addition, there's - [`SegformerForImageClassification`] which can be used to - you guessed it - classify images. The - authors of SegFormer first pre-trained the Transformer encoder on ImageNet-1k to classify images. Next, they throw - away the classification head, and replace it by the all-MLP decode head. Next, they fine-tune the model altogether on - ADE20K, Cityscapes and COCO-stuff, which are important benchmarks for semantic segmentation. All checkpoints can be - found on the [hub](https://huggingface.co/models?other=segformer). -- The quickest way to get started with SegFormer is by checking the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SegFormer) (which showcase both inference and - fine-tuning on custom data). One can also check out the [blog post](https://huggingface.co/blog/fine-tune-segformer) introducing SegFormer and illustrating how it can be fine-tuned on custom data. -- TensorFlow users should refer to [this repository](https://github.com/deep-diver/segformer-tf-transformers) that shows off-the-shelf inference and fine-tuning. -- One can also check out [this interactive demo on Hugging Face Spaces](https://huggingface.co/spaces/chansung/segformer-tf-transformers) - to try out a SegFormer model on custom images. -- SegFormer works on any input size, as it pads the input to be divisible by `config.patch_sizes`. -- One can use [`SegformerImageProcessor`] to prepare images and corresponding segmentation maps - for the model. Note that this image processor is fairly basic and does not include all data augmentations used in - the original paper. The original preprocessing pipelines (for the ADE20k dataset for instance) can be found [here](https://github.com/NVlabs/SegFormer/blob/master/local_configs/_base_/datasets/ade20k_repeat.py). The most - important preprocessing step is that images and segmentation maps are randomly cropped and padded to the same size, - such as 512x512 or 640x640, after which they are normalized. -- One additional thing to keep in mind is that one can initialize [`SegformerImageProcessor`] with - `reduce_labels` set to `True` or `False`. In some datasets (like ADE20k), the 0 index is used in the annotated - segmentation maps for background. However, ADE20k doesn't include the "background" class in its 150 labels. - Therefore, `reduce_labels` is used to reduce all labels by 1, and to make sure no loss is computed for the - background class (i.e. it replaces 0 in the annotated maps by 255, which is the *ignore_index* of the loss function - used by [`SegformerForSemanticSegmentation`]). However, other datasets use the 0 index as - background class and include this class as part of all labels. In that case, `reduce_labels` should be set to - `False`, as loss should also be computed for the background class. -- As most models, SegFormer comes in different sizes, the details of which can be found in the table below - (taken from Table 7 of the [original paper](https://arxiv.org/abs/2105.15203)). 
- -| **Model variant** | **Depths** | **Hidden sizes** | **Decoder hidden size** | **Params (M)** | **ImageNet-1k Top 1** | -| :---------------: | ------------- | ------------------- | :---------------------: | :------------: | :-------------------: | -| MiT-b0 | [2, 2, 2, 2] | [32, 64, 160, 256] | 256 | 3.7 | 70.5 | -| MiT-b1 | [2, 2, 2, 2] | [64, 128, 320, 512] | 256 | 14.0 | 78.7 | -| MiT-b2 | [3, 4, 6, 3] | [64, 128, 320, 512] | 768 | 25.4 | 81.6 | -| MiT-b3 | [3, 4, 18, 3] | [64, 128, 320, 512] | 768 | 45.2 | 83.1 | -| MiT-b4 | [3, 8, 27, 3] | [64, 128, 320, 512] | 768 | 62.6 | 83.6 | -| MiT-b5 | [3, 6, 40, 3] | [64, 128, 320, 512] | 768 | 82.0 | 83.8 | - -Note that MiT in the above table refers to the Mix Transformer encoder backbone introduced in SegFormer. For -SegFormer's results on the segmentation datasets like ADE20k, refer to the [paper](https://arxiv.org/abs/2105.15203). - - -## SegformerConfig - -[[autodoc]] SegformerConfig - -## SegformerFeatureExtractor - -[[autodoc]] SegformerFeatureExtractor - - __call__ - - post_process_semantic_segmentation - -## SegformerImageProcessor - -[[autodoc]] SegformerImageProcessor - - preprocess - - post_process_semantic_segmentation - -## SegformerModel - -[[autodoc]] SegformerModel - - forward - -## SegformerDecodeHead - -[[autodoc]] SegformerDecodeHead - - forward - -## SegformerForImageClassification - -[[autodoc]] SegformerForImageClassification - - forward - -## SegformerForSemanticSegmentation - -[[autodoc]] SegformerForSemanticSegmentation - - forward - -## TFSegformerDecodeHead - -[[autodoc]] TFSegformerDecodeHead - - call - -## TFSegformerModel - -[[autodoc]] TFSegformerModel - - call - -## TFSegformerForImageClassification - -[[autodoc]] TFSegformerForImageClassification - - call - -## TFSegformerForSemanticSegmentation - -[[autodoc]] TFSegformerForSemanticSegmentation - - call diff --git a/docs/source/en/model_doc/sew-d.md b/docs/source/en/model_doc/sew-d.md new file mode 100644 index 000000000000..b70c59061b57 --- /dev/null +++ b/docs/source/en/model_doc/sew-d.md @@ -0,0 +1,65 @@ + + +# SEW-D + +## Overview + +SEW-D (Squeezed and Efficient Wav2Vec with Disentangled attention) was proposed in [Performance-Efficiency Trade-offs +in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, +Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. + +The abstract from the paper is the following: + +*This paper is a study of performance-efficiency trade-offs in pre-trained models for automatic speech recognition +(ASR). We focus on wav2vec 2.0, and formalize several architecture designs that influence both the model performance +and its efficiency. Putting together all our observations, we introduce SEW (Squeezed and Efficient Wav2vec), a +pre-trained model architecture with significant improvements along both performance and efficiency dimensions across a +variety of training setups. For example, under the 100h-960h semi-supervised setup on LibriSpeech, SEW achieves a 1.9x +inference speedup compared to wav2vec 2.0, with a 13.5% relative reduction in word error rate. With a similar inference +time, SEW reduces word error rate by 25-50% across different model sizes.* + +Tips: + +- SEW-D is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. +- SEWDForCTC is fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded + using [`Wav2Vec2CTCTokenizer`]. 
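+
+As a minimal sketch of that decoding workflow (the checkpoint name below is an assumption used for illustration; any `SEWDForCTC` checkpoint fine-tuned for ASR should work the same way):
+
+```python
+import torch
+from datasets import load_dataset
+from transformers import AutoProcessor, SEWDForCTC
+
+# assumed fine-tuned ASR checkpoint; substitute your own SEW-D CTC checkpoint
+checkpoint = "asapp/sew-d-tiny-100k-ft-ls100h"
+processor = AutoProcessor.from_pretrained(checkpoint)
+model = SEWDForCTC.from_pretrained(checkpoint)
+
+# load a 16 kHz speech sample and turn the raw waveform into model inputs
+ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+inputs = processor(ds[0]["audio"]["array"], sampling_rate=16_000, return_tensors="pt")
+
+with torch.no_grad():
+    logits = model(**inputs).logits
+
+# greedy CTC decoding: argmax over the vocabulary, then the tokenizer collapses repeats and blanks
+predicted_ids = torch.argmax(logits, dim=-1)
+transcription = processor.batch_decode(predicted_ids)
+print(transcription[0])
+```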
+ +This model was contributed by [anton-l](https://huggingface.co/anton-l). + +## Documentation resources + +- [Audio classification task guide](../tasks/audio_classification) +- [Automatic speech recognition task guide](../tasks/asr) + +## SEWDConfig + +[[autodoc]] SEWDConfig + +## SEWDModel + +[[autodoc]] SEWDModel + - forward + +## SEWDForCTC + +[[autodoc]] SEWDForCTC + - forward + +## SEWDForSequenceClassification + +[[autodoc]] SEWDForSequenceClassification + - forward diff --git a/docs/source/en/model_doc/sew-d.mdx b/docs/source/en/model_doc/sew-d.mdx deleted file mode 100644 index ceeb4f1ec35f..000000000000 --- a/docs/source/en/model_doc/sew-d.mdx +++ /dev/null @@ -1,57 +0,0 @@ - - -# SEW-D - -## Overview - -SEW-D (Squeezed and Efficient Wav2Vec with Disentangled attention) was proposed in [Performance-Efficiency Trade-offs -in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, -Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. - -The abstract from the paper is the following: - -*This paper is a study of performance-efficiency trade-offs in pre-trained models for automatic speech recognition -(ASR). We focus on wav2vec 2.0, and formalize several architecture designs that influence both the model performance -and its efficiency. Putting together all our observations, we introduce SEW (Squeezed and Efficient Wav2vec), a -pre-trained model architecture with significant improvements along both performance and efficiency dimensions across a -variety of training setups. For example, under the 100h-960h semi-supervised setup on LibriSpeech, SEW achieves a 1.9x -inference speedup compared to wav2vec 2.0, with a 13.5% relative reduction in word error rate. With a similar inference -time, SEW reduces word error rate by 25-50% across different model sizes.* - -Tips: - -- SEW-D is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. -- SEWDForCTC is fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded - using [`Wav2Vec2CTCTokenizer`]. - -This model was contributed by [anton-l](https://huggingface.co/anton-l). - - -## SEWDConfig - -[[autodoc]] SEWDConfig - -## SEWDModel - -[[autodoc]] SEWDModel - - forward - -## SEWDForCTC - -[[autodoc]] SEWDForCTC - - forward - -## SEWDForSequenceClassification - -[[autodoc]] SEWDForSequenceClassification - - forward diff --git a/docs/source/en/model_doc/sew.md b/docs/source/en/model_doc/sew.md new file mode 100644 index 000000000000..ebf128ea429f --- /dev/null +++ b/docs/source/en/model_doc/sew.md @@ -0,0 +1,65 @@ + + +# SEW + +## Overview + +SEW (Squeezed and Efficient Wav2Vec) was proposed in [Performance-Efficiency Trade-offs in Unsupervised Pre-training +for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. +Weinberger, Yoav Artzi. + +The abstract from the paper is the following: + +*This paper is a study of performance-efficiency trade-offs in pre-trained models for automatic speech recognition +(ASR). We focus on wav2vec 2.0, and formalize several architecture designs that influence both the model performance +and its efficiency. Putting together all our observations, we introduce SEW (Squeezed and Efficient Wav2vec), a +pre-trained model architecture with significant improvements along both performance and efficiency dimensions across a +variety of training setups. 
For example, under the 100h-960h semi-supervised setup on LibriSpeech, SEW achieves a 1.9x +inference speedup compared to wav2vec 2.0, with a 13.5% relative reduction in word error rate. With a similar inference +time, SEW reduces word error rate by 25-50% across different model sizes.* + +Tips: + +- SEW is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. +- SEWForCTC is fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded using + [`Wav2Vec2CTCTokenizer`]. + +This model was contributed by [anton-l](https://huggingface.co/anton-l). + +## Documentation resources + +- [Audio classification task guide](../tasks/audio_classification) +- [Automatic speech recognition task guide](../tasks/asr) + +## SEWConfig + +[[autodoc]] SEWConfig + +## SEWModel + +[[autodoc]] SEWModel + - forward + +## SEWForCTC + +[[autodoc]] SEWForCTC + - forward + +## SEWForSequenceClassification + +[[autodoc]] SEWForSequenceClassification + - forward diff --git a/docs/source/en/model_doc/sew.mdx b/docs/source/en/model_doc/sew.mdx deleted file mode 100644 index dce949a856b3..000000000000 --- a/docs/source/en/model_doc/sew.mdx +++ /dev/null @@ -1,57 +0,0 @@ - - -# SEW - -## Overview - -SEW (Squeezed and Efficient Wav2Vec) was proposed in [Performance-Efficiency Trade-offs in Unsupervised Pre-training -for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. -Weinberger, Yoav Artzi. - -The abstract from the paper is the following: - -*This paper is a study of performance-efficiency trade-offs in pre-trained models for automatic speech recognition -(ASR). We focus on wav2vec 2.0, and formalize several architecture designs that influence both the model performance -and its efficiency. Putting together all our observations, we introduce SEW (Squeezed and Efficient Wav2vec), a -pre-trained model architecture with significant improvements along both performance and efficiency dimensions across a -variety of training setups. For example, under the 100h-960h semi-supervised setup on LibriSpeech, SEW achieves a 1.9x -inference speedup compared to wav2vec 2.0, with a 13.5% relative reduction in word error rate. With a similar inference -time, SEW reduces word error rate by 25-50% across different model sizes.* - -Tips: - -- SEW is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. -- SEWForCTC is fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded using - [`Wav2Vec2CTCTokenizer`]. - -This model was contributed by [anton-l](https://huggingface.co/anton-l). - - -## SEWConfig - -[[autodoc]] SEWConfig - -## SEWModel - -[[autodoc]] SEWModel - - forward - -## SEWForCTC - -[[autodoc]] SEWForCTC - - forward - -## SEWForSequenceClassification - -[[autodoc]] SEWForSequenceClassification - - forward diff --git a/docs/source/en/model_doc/speech-encoder-decoder.md b/docs/source/en/model_doc/speech-encoder-decoder.md new file mode 100644 index 000000000000..b036f27e1865 --- /dev/null +++ b/docs/source/en/model_doc/speech-encoder-decoder.md @@ -0,0 +1,132 @@ + + +# Speech Encoder Decoder Models + +The [`SpeechEncoderDecoderModel`] can be used to initialize a speech-to-text model +with any pretrained speech autoencoding model as the encoder (*e.g.* [Wav2Vec2](wav2vec2), [Hubert](hubert)) and any pretrained autoregressive model as the decoder. 
+
+The effectiveness of initializing speech-sequence-to-text-sequence models with pretrained checkpoints for speech
+recognition and speech translation has *e.g.* been shown in [Large-Scale Self- and Semi-Supervised Learning for Speech
+Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli,
+Alexis Conneau.
+
+An example of how to use a [`SpeechEncoderDecoderModel`] for inference can be seen in [Speech2Text2](speech_to_text_2).
+
+## Randomly initializing `SpeechEncoderDecoderModel` from model configurations.
+
+[`SpeechEncoderDecoderModel`] can be randomly initialized from an encoder and a decoder config. In the following example, we show how to do this using the default [`Wav2Vec2Model`] configuration for the encoder
+and the default [`BertForCausalLM`] configuration for the decoder.
+
+```python
+>>> from transformers import BertConfig, Wav2Vec2Config, SpeechEncoderDecoderConfig, SpeechEncoderDecoderModel
+
+>>> config_encoder = Wav2Vec2Config()
+>>> config_decoder = BertConfig()
+
+>>> config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
+>>> model = SpeechEncoderDecoderModel(config=config)
+```
+
+## Initializing `SpeechEncoderDecoderModel` from a pretrained encoder and a pretrained decoder.
+
+[`SpeechEncoderDecoderModel`] can be initialized from a pretrained encoder checkpoint and a pretrained decoder checkpoint. Note that any pretrained Transformer-based speech model, *e.g.* [Wav2Vec2](wav2vec2), [Hubert](hubert) can serve as the encoder and both pretrained auto-encoding models, *e.g.* BERT, pretrained causal language models, *e.g.* GPT2, as well as the pretrained decoder part of sequence-to-sequence models, *e.g.* decoder of BART, can be used as the decoder.
+Depending on which architecture you choose as the decoder, the cross-attention layers might be randomly initialized.
+Initializing [`SpeechEncoderDecoderModel`] from a pretrained encoder and decoder checkpoint requires the model to be fine-tuned on a downstream task, as has been shown in [the *Warm-starting-encoder-decoder blog post*](https://huggingface.co/blog/warm-starting-encoder-decoder).
+To do so, the `SpeechEncoderDecoderModel` class provides a [`SpeechEncoderDecoderModel.from_encoder_decoder_pretrained`] method.
+
+```python
+>>> from transformers import SpeechEncoderDecoderModel
+
+>>> model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
+...     "facebook/hubert-large-ll60k", "bert-base-uncased"
+... )
+```
+
+## Loading an existing `SpeechEncoderDecoderModel` checkpoint and performing inference.
+
+To load fine-tuned checkpoints of the `SpeechEncoderDecoderModel` class, [`SpeechEncoderDecoderModel`] provides the `from_pretrained(...)` method just like any other model architecture in Transformers.
+
+To perform inference, one uses the [`generate`] method, which allows you to autoregressively generate text. This method supports various forms of decoding, such as greedy, beam search and multinomial sampling.
+ +```python +>>> from transformers import Wav2Vec2Processor, SpeechEncoderDecoderModel +>>> from datasets import load_dataset +>>> import torch + +>>> # load a fine-tuned speech translation model and corresponding processor +>>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15") +>>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15") + +>>> # let's perform inference on a piece of English speech (which we'll translate to German) +>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") +>>> input_values = processor(ds[0]["audio"]["array"], return_tensors="pt").input_values + +>>> # autoregressively generate transcription (uses greedy decoding by default) +>>> generated_ids = model.generate(input_values) +>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] +>>> print(generated_text) +Mr. Quilter ist der Apostel der Mittelschicht und wir freuen uns, sein Evangelium willkommen heißen zu können. +``` + +## Training + +Once the model is created, it can be fine-tuned similar to BART, T5 or any other encoder-decoder model on a dataset of (speech, text) pairs. +As you can see, only 2 inputs are required for the model in order to compute a loss: `input_values` (which are the +speech inputs) and `labels` (which are the `input_ids` of the encoded target sequence). + +```python +>>> from transformers import AutoTokenizer, AutoFeatureExtractor, SpeechEncoderDecoderModel +>>> from datasets import load_dataset + +>>> encoder_id = "facebook/wav2vec2-base-960h" # acoustic model encoder +>>> decoder_id = "bert-base-uncased" # text decoder + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained(encoder_id) +>>> tokenizer = AutoTokenizer.from_pretrained(decoder_id) +>>> # Combine pre-trained encoder and pre-trained decoder to form a Seq2Seq model +>>> model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(encoder_id, decoder_id) + +>>> model.config.decoder_start_token_id = tokenizer.cls_token_id +>>> model.config.pad_token_id = tokenizer.pad_token_id + +>>> # load an audio input and pre-process (normalise mean/std to 0/1) +>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") +>>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values + +>>> # load its corresponding transcription and tokenize to generate labels +>>> labels = tokenizer(ds[0]["text"], return_tensors="pt").input_ids + +>>> # the forward function automatically creates the correct decoder_input_ids +>>> loss = model(input_values=input_values, labels=labels).loss +>>> loss.backward() +``` + +## SpeechEncoderDecoderConfig + +[[autodoc]] SpeechEncoderDecoderConfig + +## SpeechEncoderDecoderModel + +[[autodoc]] SpeechEncoderDecoderModel + - forward + - from_encoder_decoder_pretrained + +## FlaxSpeechEncoderDecoderModel + +[[autodoc]] FlaxSpeechEncoderDecoderModel + - __call__ + - from_encoder_decoder_pretrained diff --git a/docs/source/en/model_doc/speech-encoder-decoder.mdx b/docs/source/en/model_doc/speech-encoder-decoder.mdx deleted file mode 100644 index b0718a27a88c..000000000000 --- a/docs/source/en/model_doc/speech-encoder-decoder.mdx +++ /dev/null @@ -1,128 +0,0 @@ - - -# Speech Encoder Decoder Models - -The [`SpeechEncoderDecoderModel`] can be used to initialize a speech-to-text model -with any pretrained speech autoencoding model as the encoder (*e.g.* [Wav2Vec2](wav2vec2), [Hubert](hubert)) and 
any pretrained autoregressive model as the decoder. - -The effectiveness of initializing speech-sequence-to-text-sequence models with pretrained checkpoints for speech -recognition and speech translation has *e.g.* been shown in [Large-Scale Self- and Semi-Supervised Learning for Speech -Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, -Alexis Conneau. - -An example of how to use a [`SpeechEncoderDecoderModel`] for inference can be seen in [Speech2Text2](speech_to_text_2). - -## Randomly initializing `SpeechEncoderDecoderModel` from model configurations. - -[`SpeechEncoderDecoderModel`] can be randomly initialized from an encoder and a decoder config. In the following example, we show how to do this using the default [`Wav2Vec2Model`] configuration for the encoder -and the default [`BertForCausalLM`] configuration for the decoder. - -```python ->>> from transformers import BertConfig, Wav2Vec2Config, SpeechEncoderDecoderConfig, SpeechEncoderDecoderModel - ->>> config_encoder = Wav2Vec2Config() ->>> config_decoder = BertConfig() - ->>> config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder) ->>> model = SpeechEncoderDecoderModel(config=config) -``` - -## Initialising `SpeechEncoderDecoderModel` from a pretrained encoder and a pretrained decoder. - -[`SpeechEncoderDecoderModel`] can be initialized from a pretrained encoder checkpoint and a pretrained decoder checkpoint. Note that any pretrained Transformer-based speech model, *e.g.* [Wav2Vec2](wav2vec2), [Hubert](hubert) can serve as the encoder and both pretrained auto-encoding models, *e.g.* BERT, pretrained causal language models, *e.g.* GPT2, as well as the pretrained decoder part of sequence-to-sequence models, *e.g.* decoder of BART, can be used as the decoder. -Depending on which architecture you choose as the decoder, the cross-attention layers might be randomly initialized. -Initializing [`SpeechEncoderDecoderModel`] from a pretrained encoder and decoder checkpoint requires the model to be fine-tuned on a downstream task, as has been shown in [the *Warm-starting-encoder-decoder blog post*](https://huggingface.co/blog/warm-starting-encoder-decoder). -To do so, the `SpeechEncoderDecoderModel` class provides a [`SpeechEncoderDecoderModel.from_encoder_decoder_pretrained`] method. - -```python ->>> from transformers import SpeechEncoderDecoderModel - ->>> model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained( -... "facebook/hubert-large-ll60k", "bert-base-uncased" -... ) -``` - -## Loading an existing `SpeechEncoderDecoderModel` checkpoint and perform inference. - -To load fine-tuned checkpoints of the `SpeechEncoderDecoderModel` class, [`SpeechEncoderDecoderModel`] provides the `from_pretrained(...)` method just like any other model architecture in Transformers. - -To perform inference, one uses the [`generate`] method, which allows to autoregressively generate text. This method supports various forms of decoding, such as greedy, beam search and multinomial sampling. 
- -```python ->>> from transformers import Wav2Vec2Processor, SpeechEncoderDecoderModel ->>> from datasets import load_dataset ->>> import torch - ->>> # load a fine-tuned speech translation model and corresponding processor ->>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15") ->>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15") - ->>> # let's perform inference on a piece of English speech (which we'll translate to German) ->>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ->>> input_values = processor(ds[0]["audio"]["array"], return_tensors="pt").input_values - ->>> # autoregressively generate transcription (uses greedy decoding by default) ->>> generated_ids = model.generate(input_values) ->>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] ->>> print(generated_text) -Mr. Quilter ist der Apostel der Mittelschicht und wir freuen uns, sein Evangelium willkommen heißen zu können. -``` - -## Training - -Once the model is created, it can be fine-tuned similar to BART, T5 or any other encoder-decoder model on a dataset of (speech, text) pairs. -As you can see, only 2 inputs are required for the model in order to compute a loss: `input_values` (which are the -speech inputs) and `labels` (which are the `input_ids` of the encoded target sequence). - -```python ->>> from transformers import AutoTokenizer, AutoFeatureExtractor, SpeechEncoderDecoderModel ->>> from datasets import load_dataset - ->>> encoder_id = "facebook/wav2vec2-base-960h" # acoustic model encoder ->>> decoder_id = "bert-base-uncased" # text decoder - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained(encoder_id) ->>> tokenizer = AutoTokenizer.from_pretrained(decoder_id) ->>> # Combine pre-trained encoder and pre-trained decoder to form a Seq2Seq model ->>> model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(encoder_id, decoder_id) - ->>> model.config.decoder_start_token_id = tokenizer.cls_token_id ->>> model.config.pad_token_id = tokenizer.pad_token_id - ->>> # load an audio input and pre-process (normalise mean/std to 0/1) ->>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ->>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values - ->>> # load its corresponding transcription and tokenize to generate labels ->>> labels = tokenizer(ds[0]["text"], return_tensors="pt").input_ids - ->>> # the forward function automatically creates the correct decoder_input_ids ->>> loss = model(**input_features).loss ->>> loss.backward() -``` - -## SpeechEncoderDecoderConfig - -[[autodoc]] SpeechEncoderDecoderConfig - -## SpeechEncoderDecoderModel - -[[autodoc]] SpeechEncoderDecoderModel - - forward - - from_encoder_decoder_pretrained - -## FlaxSpeechEncoderDecoderModel - -[[autodoc]] FlaxSpeechEncoderDecoderModel - - __call__ - - from_encoder_decoder_pretrained \ No newline at end of file diff --git a/docs/source/en/model_doc/speech_to_text.md b/docs/source/en/model_doc/speech_to_text.md new file mode 100644 index 000000000000..cb13a1871ae6 --- /dev/null +++ b/docs/source/en/model_doc/speech_to_text.md @@ -0,0 +1,146 @@ + + +# Speech2Text + +## Overview + +The Speech2Text model was proposed in [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. 
It's a
+transformer-based seq2seq (encoder-decoder) model designed for end-to-end Automatic Speech Recognition (ASR) and Speech
+Translation (ST). It uses a convolutional downsampler to reduce the length of speech inputs by 3/4th before they are
+fed into the encoder. The model is trained with standard autoregressive cross-entropy loss and generates the
+transcripts/translations autoregressively. Speech2Text has been fine-tuned on several datasets for ASR and ST:
+[LibriSpeech](http://www.openslr.org/12), [CoVoST 2](https://github.com/facebookresearch/covost), [MuST-C](https://ict.fbk.eu/must-c/).
+
+This model was contributed by [valhalla](https://huggingface.co/valhalla). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/speech_to_text).
+
+
+## Inference
+
+Speech2Text is a speech model that accepts a float tensor of log-mel filter-bank features extracted from the speech
+signal. It's a transformer-based seq2seq model, so the transcripts/translations are generated autoregressively. The
+`generate()` method can be used for inference.
+
+The [`Speech2TextFeatureExtractor`] class is responsible for extracting the log-mel filter-bank
+features. The [`Speech2TextProcessor`] wraps [`Speech2TextFeatureExtractor`] and
+[`Speech2TextTokenizer`] into a single instance to both extract the input features and decode the
+predicted token ids.
+
+The feature extractor depends on `torchaudio` and the tokenizer depends on `sentencepiece` so be sure to
+install those packages before running the examples. You could either install those as extra speech dependencies with
+`pip install transformers"[speech, sentencepiece]"` or install the packages separately with `pip install torchaudio sentencepiece`. Also `torchaudio` requires the development version of the [libsndfile](http://www.mega-nerd.com/libsndfile/) package which can be installed via a system package manager. On Ubuntu it can
+be installed as follows: `apt install libsndfile1-dev`
+
+
+- ASR and Speech Translation
+
+```python
+>>> import torch
+>>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
+>>> from datasets import load_dataset
+
+>>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
+>>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
+
+
+>>> ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+
+>>> inputs = processor(ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt")
+>>> generated_ids = model.generate(inputs["input_features"], attention_mask=inputs["attention_mask"])
+
+>>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
+>>> transcription
+['mister quilter is the apostle of the middle classes and we are glad to welcome his gospel']
+```
+
+- Multilingual speech translation
+
+  For multilingual speech translation models, `eos_token_id` is used as the `decoder_start_token_id` and
+  the target language id is forced as the first generated token. To force the target language id as the first
+  generated token, pass the `forced_bos_token_id` parameter to the `generate()` method. The following
+  example shows how to translate English speech to French text using the *facebook/s2t-medium-mustc-multilingual-st*
+  checkpoint.
+ +```python +>>> import torch +>>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration +>>> from datasets import load_dataset + +>>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-mustc-multilingual-st") +>>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-mustc-multilingual-st") + +>>> ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") + +>>> inputs = processor(ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt") +>>> generated_ids = model.generate( +... inputs["input_features"], +... attention_mask=inputs["attention_mask"], +... forced_bos_token_id=processor.tokenizer.lang_code_to_id["fr"], +... ) + +>>> translation = processor.batch_decode(generated_ids, skip_special_tokens=True) +>>> translation +["(Vidéo) Si M. Kilder est l'apossible des classes moyennes, et nous sommes heureux d'être accueillis dans son évangile."] +``` + +See the [model hub](https://huggingface.co/models?filter=speech_to_text) to look for Speech2Text checkpoints. + + +## Speech2TextConfig + +[[autodoc]] Speech2TextConfig + +## Speech2TextTokenizer + +[[autodoc]] Speech2TextTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## Speech2TextFeatureExtractor + +[[autodoc]] Speech2TextFeatureExtractor + - __call__ + +## Speech2TextProcessor + +[[autodoc]] Speech2TextProcessor + - __call__ + - from_pretrained + - save_pretrained + - batch_decode + - decode + +## Speech2TextModel + +[[autodoc]] Speech2TextModel + - forward + +## Speech2TextForConditionalGeneration + +[[autodoc]] Speech2TextForConditionalGeneration + - forward + +## TFSpeech2TextModel + +[[autodoc]] TFSpeech2TextModel + - call + +## TFSpeech2TextForConditionalGeneration + +[[autodoc]] TFSpeech2TextForConditionalGeneration + - call diff --git a/docs/source/en/model_doc/speech_to_text.mdx b/docs/source/en/model_doc/speech_to_text.mdx deleted file mode 100644 index 95efc5504ff8..000000000000 --- a/docs/source/en/model_doc/speech_to_text.mdx +++ /dev/null @@ -1,142 +0,0 @@ - - -# Speech2Text - -## Overview - -The Speech2Text model was proposed in [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. It's a -transformer-based seq2seq (encoder-decoder) model designed for end-to-end Automatic Speech Recognition (ASR) and Speech -Translation (ST). It uses a convolutional downsampler to reduce the length of speech inputs by 3/4th before they are -fed into the encoder. The model is trained with standard autoregressive cross-entropy loss and generates the -transcripts/translations autoregressively. Speech2Text has been fine-tuned on several datasets for ASR and ST: -[LibriSpeech](http://www.openslr.org/12), [CoVoST 2](https://github.com/facebookresearch/covost), [MuST-C](https://ict.fbk.eu/must-c/). - -This model was contributed by [valhalla](https://huggingface.co/valhalla). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/speech_to_text). - - -## Inference - -Speech2Text is a speech model that accepts a float tensor of log-mel filter-bank features extracted from the speech -signal. It's a transformer-based seq2seq model, so the transcripts/translations are generated autoregressively. The -`generate()` method can be used for inference. 
- -The [`Speech2TextFeatureExtractor`] class is responsible for extracting the log-mel filter-bank -features. The [`Speech2TextProcessor`] wraps [`Speech2TextFeatureExtractor`] and -[`Speech2TextTokenizer`] into a single instance to both extract the input features and decode the -predicted token ids. - -The feature extractor depends on `torchaudio` and the tokenizer depends on `sentencepiece` so be sure to -install those packages before running the examples. You could either install those as extra speech dependencies with -`pip install transformers"[speech, sentencepiece]"` or install the packages separately with `pip install torchaudio sentencepiece`. Also `torchaudio` requires the development version of the [libsndfile](http://www.mega-nerd.com/libsndfile/) package which can be installed via a system package manager. On Ubuntu it can -be installed as follows: `apt install libsndfile1-dev` - - -- ASR and Speech Translation - -```python ->>> import torch ->>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration ->>> from datasets import load_dataset - ->>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr") ->>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr") - - ->>> ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") - ->>> inputs = processor(ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt") ->>> generated_ids = model.generate(inputs["input_features"], attention_mask=inputs["attention_mask"]) - ->>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True) ->>> transcription -['mister quilter is the apostle of the middle classes and we are glad to welcome his gospel'] -``` - -- Multilingual speech translation - - For multilingual speech translation models, `eos_token_id` is used as the `decoder_start_token_id` and - the target language id is forced as the first generated token. To force the target language id as the first - generated token, pass the `forced_bos_token_id` parameter to the `generate()` method. The following - example shows how to transate English speech to French text using the *facebook/s2t-medium-mustc-multilingual-st* - checkpoint. - -```python ->>> import torch ->>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration ->>> from datasets import load_dataset - ->>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-mustc-multilingual-st") ->>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-mustc-multilingual-st") - ->>> ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") - ->>> inputs = processor(ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt") ->>> generated_ids = model.generate( -... inputs["input_features"], -... attention_mask=inputs["attention_mask"], -... forced_bos_token_id=processor.tokenizer.lang_code_to_id["fr"], -... ) - ->>> translation = processor.batch_decode(generated_ids, skip_special_tokens=True) ->>> translation -["(Vidéo) Si M. Kilder est l'apossible des classes moyennes, et nous sommes heureux d'être accueillis dans son évangile."] -``` - -See the [model hub](https://huggingface.co/models?filter=speech_to_text) to look for Speech2Text checkpoints. 
- - -## Speech2TextConfig - -[[autodoc]] Speech2TextConfig - -## Speech2TextTokenizer - -[[autodoc]] Speech2TextTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## Speech2TextFeatureExtractor - -[[autodoc]] Speech2TextFeatureExtractor - - __call__ - -## Speech2TextProcessor - -[[autodoc]] Speech2TextProcessor - - __call__ - - from_pretrained - - save_pretrained - - batch_decode - - decode - -## Speech2TextModel - -[[autodoc]] Speech2TextModel - - forward - -## Speech2TextForConditionalGeneration - -[[autodoc]] Speech2TextForConditionalGeneration - - forward - -## TFSpeech2TextModel - -[[autodoc]] TFSpeech2TextModel - - call - -## TFSpeech2TextForConditionalGeneration - -[[autodoc]] TFSpeech2TextForConditionalGeneration - - call diff --git a/docs/source/en/model_doc/speech_to_text_2.md b/docs/source/en/model_doc/speech_to_text_2.md new file mode 100644 index 000000000000..1abdeced580e --- /dev/null +++ b/docs/source/en/model_doc/speech_to_text_2.md @@ -0,0 +1,128 @@ + + +# Speech2Text2 + +## Overview + +The Speech2Text2 model is used together with [Wav2Vec2](wav2vec2) for Speech Translation models proposed in +[Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by +Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. + +Speech2Text2 is a *decoder-only* transformer model that can be used with any speech *encoder-only*, such as +[Wav2Vec2](wav2vec2) or [HuBERT](hubert) for Speech-to-Text tasks. Please refer to the +[SpeechEncoderDecoder](speech-encoder-decoder) class on how to combine Speech2Text2 with any speech *encoder-only* +model. + +This model was contributed by [Patrick von Platen](https://huggingface.co/patrickvonplaten). + +The original code can be found [here](https://github.com/pytorch/fairseq/blob/1f7ef9ed1e1061f8c7f88f8b94c7186834398690/fairseq/models/wav2vec/wav2vec2_asr.py#L266). + + +Tips: + +- Speech2Text2 achieves state-of-the-art results on the CoVoST Speech Translation dataset. For more information, see + the [official models](https://huggingface.co/models?other=speech2text2) . +- Speech2Text2 is always used within the [SpeechEncoderDecoder](speech-encoder-decoder) framework. +- Speech2Text2's tokenizer is based on [fastBPE](https://github.com/glample/fastBPE). + +## Inference + +Speech2Text2's [`SpeechEncoderDecoderModel`] model accepts raw waveform input values from speech and +makes use of [`~generation.GenerationMixin.generate`] to translate the input speech +autoregressively to the target language. + +The [`Wav2Vec2FeatureExtractor`] class is responsible for preprocessing the input speech and +[`Speech2Text2Tokenizer`] decodes the generated target tokens to the target string. The +[`Speech2Text2Processor`] wraps [`Wav2Vec2FeatureExtractor`] and +[`Speech2Text2Tokenizer`] into a single instance to both extract the input features and decode the +predicted token ids. + +- Step-by-step Speech Translation + +```python +>>> import torch +>>> from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel +>>> from datasets import load_dataset +>>> import soundfile as sf + +>>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de") +>>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de") + + +>>> def map_to_array(batch): +... speech, _ = sf.read(batch["file"]) +... batch["speech"] = speech +... 
return batch
+
+
+>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+>>> ds = ds.map(map_to_array)
+
+>>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
+>>> generated_ids = model.generate(inputs=inputs["input_values"], attention_mask=inputs["attention_mask"])
+
+>>> transcription = processor.batch_decode(generated_ids)
+```
+
+- Speech Translation via Pipelines
+
+  The automatic speech recognition pipeline can also be used to translate speech in just a couple lines of code
+
+```python
+>>> from datasets import load_dataset
+>>> from transformers import pipeline
+
+>>> librispeech_en = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+>>> asr = pipeline(
+...     "automatic-speech-recognition",
+...     model="facebook/s2t-wav2vec2-large-en-de",
+...     feature_extractor="facebook/s2t-wav2vec2-large-en-de",
+... )
+
+>>> translation_de = asr(librispeech_en[0]["file"])
+```
+
+See [model hub](https://huggingface.co/models?filter=speech2text2) to look for Speech2Text2 checkpoints.
+
+## Documentation resources
+
+- [Causal language modeling task guide](../tasks/language_modeling)
+
+## Speech2Text2Config
+
+[[autodoc]] Speech2Text2Config
+
+## Speech2Text2Tokenizer
+
+[[autodoc]] Speech2Text2Tokenizer
+    - batch_decode
+    - decode
+    - save_vocabulary
+
+## Speech2Text2Processor
+
+[[autodoc]] Speech2Text2Processor
+    - __call__
+    - from_pretrained
+    - save_pretrained
+    - batch_decode
+    - decode
+
+## Speech2Text2ForCausalLM
+
+[[autodoc]] Speech2Text2ForCausalLM
+    - forward
diff --git a/docs/source/en/model_doc/speech_to_text_2.mdx b/docs/source/en/model_doc/speech_to_text_2.mdx
deleted file mode 100644
index 2e3ebc3f390a..000000000000
--- a/docs/source/en/model_doc/speech_to_text_2.mdx
+++ /dev/null
@@ -1,121 +0,0 @@
-
-
-# Speech2Text2
-
-## Overview
-
-The Speech2Text2 model is used together with [Wav2Vec2](wav2vec2) for Speech Translation models proposed in
-[Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by
-Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
-
-Speech2Text2 is a *decoder-only* transformer model that can be used with any speech *encoder-only*, such as
-[Wav2Vec2](wav2vec2) or [HuBERT](hubert) for Speech-to-Text tasks. Please refer to the
-[SpeechEncoderDecoder](speech-encoder-decoder) class on how to combine Speech2Text2 with any speech *encoder-only*
-model.
-
-This model was contributed by [Patrick von Platen](https://huggingface.co/patrickvonplaten).
-
-The original code can be found [here](https://github.com/pytorch/fairseq/blob/1f7ef9ed1e1061f8c7f88f8b94c7186834398690/fairseq/models/wav2vec/wav2vec2_asr.py#L266).
-
-
-Tips:
-
-- Speech2Text2 achieves state-of-the-art results on the CoVoST Speech Translation dataset. For more information, see
-  the [official models](https://huggingface.co/models?other=speech2text2) .
-- Speech2Text2 is always used within the [SpeechEncoderDecoder](speech-encoder-decoder) framework.
-- Speech2Text2's tokenizer is based on [fastBPE](https://github.com/glample/fastBPE).
-
-## Inference
-
-Speech2Text2's [`SpeechEncoderDecoderModel`] model accepts raw waveform input values from speech and
-makes use of [`~generation.GenerationMixin.generate`] to translate the input speech
-autoregressively to the target language.
- -The [`Wav2Vec2FeatureExtractor`] class is responsible for preprocessing the input speech and -[`Speech2Text2Tokenizer`] decodes the generated target tokens to the target string. The -[`Speech2Text2Processor`] wraps [`Wav2Vec2FeatureExtractor`] and -[`Speech2Text2Tokenizer`] into a single instance to both extract the input features and decode the -predicted token ids. - -- Step-by-step Speech Translation - -```python ->>> import torch ->>> from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel ->>> from datasets import load_dataset ->>> import soundfile as sf - ->>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de") ->>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de") - - ->>> def map_to_array(batch): -... speech, _ = sf.read(batch["file"]) -... batch["speech"] = speech -... return batch - - ->>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ->>> ds = ds.map(map_to_array) - ->>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt") ->>> generated_ids = model.generate(inputs=inputs["input_values"], attention_mask=inputs["attention_mask"]) - ->>> transcription = processor.batch_decode(generated_ids) -``` - -- Speech Translation via Pipelines - - The automatic speech recognition pipeline can also be used to translate speech in just a couple lines of code - -```python ->>> from datasets import load_dataset ->>> from transformers import pipeline - ->>> librispeech_en = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ->>> asr = pipeline( -... "automatic-speech-recognition", -... model="facebook/s2t-wav2vec2-large-en-de", -... feature_extractor="facebook/s2t-wav2vec2-large-en-de", -... ) - ->>> translation_de = asr(librispeech_en[0]["file"]) -``` - -See [model hub](https://huggingface.co/models?filter=speech2text2) to look for Speech2Text2 checkpoints. - - -## Speech2Text2Config - -[[autodoc]] Speech2Text2Config - -## Speech2TextTokenizer - -[[autodoc]] Speech2Text2Tokenizer - - batch_decode - - decode - - save_vocabulary - -## Speech2Text2Processor - -[[autodoc]] Speech2Text2Processor - - __call__ - - from_pretrained - - save_pretrained - - batch_decode - - decode - -## Speech2Text2ForCausalLM - -[[autodoc]] Speech2Text2ForCausalLM - - forward diff --git a/docs/source/en/model_doc/speecht5.md b/docs/source/en/model_doc/speecht5.md new file mode 100644 index 000000000000..4d5e2098a542 --- /dev/null +++ b/docs/source/en/model_doc/speecht5.md @@ -0,0 +1,85 @@ + + +# SpeechT5 + +## Overview + +The SpeechT5 model was proposed in [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei. + +The abstract from the paper is the following: + +*Motivated by the success of T5 (Text-To-Text Transfer Transformer) in pre-trained natural language processing models, we propose a unified-modal SpeechT5 framework that explores the encoder-decoder pre-training for self-supervised speech/text representation learning. The SpeechT5 framework consists of a shared encoder-decoder network and six modal-specific (speech/text) pre/post-nets. 
After preprocessing the input speech/text through the pre-nets, the shared encoder-decoder network models the sequence-to-sequence transformation, and then the post-nets generate the output in the speech/text modality based on the output of the decoder. Leveraging large-scale unlabeled speech and text data, we pre-train SpeechT5 to learn a unified-modal representation, hoping to improve the modeling capability for both speech and text. To align the textual and speech information into this unified semantic space, we propose a cross-modal vector quantization approach that randomly mixes up speech/text states with latent units as the interface between encoder and decoder. Extensive evaluations show the superiority of the proposed SpeechT5 framework on a wide variety of spoken language processing tasks, including automatic speech recognition, speech synthesis, speech translation, voice conversion, speech enhancement, and speaker identification.* + +This model was contributed by [Matthijs](https://huggingface.co/Matthijs). The original code can be found [here](https://github.com/microsoft/SpeechT5). + +## SpeechT5Config + +[[autodoc]] SpeechT5Config + +## SpeechT5HifiGanConfig + +[[autodoc]] SpeechT5HifiGanConfig + +## SpeechT5Tokenizer + +[[autodoc]] SpeechT5Tokenizer + - __call__ + - save_vocabulary + - decode + - batch_decode + +## SpeechT5FeatureExtractor + +[[autodoc]] SpeechT5FeatureExtractor + - __call__ + +## SpeechT5Processor + +[[autodoc]] SpeechT5Processor + - __call__ + - pad + - from_pretrained + - save_pretrained + - batch_decode + - decode + +## SpeechT5Model + +[[autodoc]] SpeechT5Model + - forward + +## SpeechT5ForSpeechToText + +[[autodoc]] SpeechT5ForSpeechToText + - forward + +## SpeechT5ForTextToSpeech + +[[autodoc]] SpeechT5ForTextToSpeech + - forward + - generate + +## SpeechT5ForSpeechToSpeech + +[[autodoc]] SpeechT5ForSpeechToSpeech + - forward + - generate_speech + +## SpeechT5HifiGan + +[[autodoc]] SpeechT5HifiGan + - forward diff --git a/docs/source/en/model_doc/splinter.md b/docs/source/en/model_doc/splinter.md new file mode 100644 index 000000000000..f16169d9b218 --- /dev/null +++ b/docs/source/en/model_doc/splinter.md @@ -0,0 +1,87 @@ + + +# Splinter + +## Overview + +The Splinter model was proposed in [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. Splinter +is an encoder-only transformer (similar to BERT) pretrained using the recurring span selection task on a large corpus +comprising Wikipedia and the Toronto Book Corpus. + +The abstract from the paper is the following: + +In several question answering benchmarks, pretrained models have reached human parity through fine-tuning on an order +of 100,000 annotated questions and answers. We explore the more realistic few-shot setting, where only a few hundred +training examples are available, and observe that standard models perform poorly, highlighting the discrepancy between +current pretraining objectives and question answering. We propose a new pretraining scheme tailored for question +answering: recurring span selection. Given a passage with multiple sets of recurring spans, we mask in each set all +recurring spans but one, and ask the model to select the correct span in the passage for each masked span. Masked spans +are replaced with a special token, viewed as a question representation, that is later used during fine-tuning to select +the answer span. 
The resulting model obtains surprisingly good results on multiple benchmarks (e.g., 72.7 F1 on SQuAD +with only 128 training examples), while maintaining competitive performance in the high-resource setting. + +Tips: + +- Splinter was trained to predict answers spans conditioned on a special [QUESTION] token. These tokens contextualize + to question representations which are used to predict the answers. This layer is called QASS, and is the default + behaviour in the [`SplinterForQuestionAnswering`] class. Therefore: +- Use [`SplinterTokenizer`] (rather than [`BertTokenizer`]), as it already + contains this special token. Also, its default behavior is to use this token when two sequences are given (for + example, in the *run_qa.py* script). +- If you plan on using Splinter outside *run_qa.py*, please keep in mind the question token - it might be important for + the success of your model, especially in a few-shot setting. +- Please note there are two different checkpoints for each size of Splinter. Both are basically the same, except that + one also has the pretrained weights of the QASS layer (*tau/splinter-base-qass* and *tau/splinter-large-qass*) and one + doesn't (*tau/splinter-base* and *tau/splinter-large*). This is done to support randomly initializing this layer at + fine-tuning, as it is shown to yield better results for some cases in the paper. + +This model was contributed by [yuvalkirstain](https://huggingface.co/yuvalkirstain) and [oriram](https://huggingface.co/oriram). The original code can be found [here](https://github.com/oriram/splinter). + +## Documentation resources + +- [Question answering task guide](../tasks/question-answering) + +## SplinterConfig + +[[autodoc]] SplinterConfig + +## SplinterTokenizer + +[[autodoc]] SplinterTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## SplinterTokenizerFast + +[[autodoc]] SplinterTokenizerFast + +## SplinterModel + +[[autodoc]] SplinterModel + - forward + +## SplinterForQuestionAnswering + +[[autodoc]] SplinterForQuestionAnswering + - forward + +## SplinterForPreTraining + +[[autodoc]] SplinterForPreTraining + - forward diff --git a/docs/source/en/model_doc/splinter.mdx b/docs/source/en/model_doc/splinter.mdx deleted file mode 100644 index 55e5f61b8d0b..000000000000 --- a/docs/source/en/model_doc/splinter.mdx +++ /dev/null @@ -1,79 +0,0 @@ - - -# Splinter - -## Overview - -The Splinter model was proposed in [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. Splinter -is an encoder-only transformer (similar to BERT) pretrained using the recurring span selection task on a large corpus -comprising Wikipedia and the Toronto Book Corpus. - -The abstract from the paper is the following: - -In several question answering benchmarks, pretrained models have reached human parity through fine-tuning on an order -of 100,000 annotated questions and answers. We explore the more realistic few-shot setting, where only a few hundred -training examples are available, and observe that standard models perform poorly, highlighting the discrepancy between -current pretraining objectives and question answering. We propose a new pretraining scheme tailored for question -answering: recurring span selection. 
Given a passage with multiple sets of recurring spans, we mask in each set all -recurring spans but one, and ask the model to select the correct span in the passage for each masked span. Masked spans -are replaced with a special token, viewed as a question representation, that is later used during fine-tuning to select -the answer span. The resulting model obtains surprisingly good results on multiple benchmarks (e.g., 72.7 F1 on SQuAD -with only 128 training examples), while maintaining competitive performance in the high-resource setting. - -Tips: - -- Splinter was trained to predict answers spans conditioned on a special [QUESTION] token. These tokens contextualize - to question representations which are used to predict the answers. This layer is called QASS, and is the default - behaviour in the [`SplinterForQuestionAnswering`] class. Therefore: -- Use [`SplinterTokenizer`] (rather than [`BertTokenizer`]), as it already - contains this special token. Also, its default behavior is to use this token when two sequences are given (for - example, in the *run_qa.py* script). -- If you plan on using Splinter outside *run_qa.py*, please keep in mind the question token - it might be important for - the success of your model, especially in a few-shot setting. -- Please note there are two different checkpoints for each size of Splinter. Both are basically the same, except that - one also has the pretrained weights of the QASS layer (*tau/splinter-base-qass* and *tau/splinter-large-qass*) and one - doesn't (*tau/splinter-base* and *tau/splinter-large*). This is done to support randomly initializing this layer at - fine-tuning, as it is shown to yield better results for some cases in the paper. - -This model was contributed by [yuvalkirstain](https://huggingface.co/yuvalkirstain) and [oriram](https://huggingface.co/oriram). The original code can be found [here](https://github.com/oriram/splinter). - -## SplinterConfig - -[[autodoc]] SplinterConfig - -## SplinterTokenizer - -[[autodoc]] SplinterTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## SplinterTokenizerFast - -[[autodoc]] SplinterTokenizerFast - -## SplinterModel - -[[autodoc]] SplinterModel - - forward - -## SplinterForQuestionAnswering - -[[autodoc]] SplinterForQuestionAnswering - - forward - -## SplinterForPreTraining - -[[autodoc]] SplinterForPreTraining - - forward diff --git a/docs/source/en/model_doc/squeezebert.md b/docs/source/en/model_doc/squeezebert.md new file mode 100644 index 000000000000..515a2ef31781 --- /dev/null +++ b/docs/source/en/model_doc/squeezebert.md @@ -0,0 +1,99 @@ + + +# SqueezeBERT + +## Overview + +The SqueezeBERT model was proposed in [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, Kurt W. Keutzer. It's a +bidirectional transformer similar to the BERT model. The key difference between the BERT architecture and the +SqueezeBERT architecture is that SqueezeBERT uses [grouped convolutions](https://blog.yani.io/filter-group-tutorial) +instead of fully-connected layers for the Q, K, V and FFN layers. + +The abstract from the paper is the following: + +*Humans read and write hundreds of billions of messages every day. 
Further, due to the availability of large datasets, +large computing systems, and better neural network models, natural language processing (NLP) technology has made +significant strides in understanding, proofreading, and organizing these messages. Thus, there is a significant +opportunity to deploy NLP in myriad applications to help web users, social networks, and businesses. In particular, we +consider smartphones and other mobile devices as crucial platforms for deploying NLP models at scale. However, today's +highly-accurate NLP neural network models such as BERT and RoBERTa are extremely computationally expensive, with +BERT-base taking 1.7 seconds to classify a text snippet on a Pixel 3 smartphone. In this work, we observe that methods +such as grouped convolutions have yielded significant speedups for computer vision networks, but many of these +techniques have not been adopted by NLP neural network designers. We demonstrate how to replace several operations in +self-attention layers with grouped convolutions, and we use this technique in a novel network architecture called +SqueezeBERT, which runs 4.3x faster than BERT-base on the Pixel 3 while achieving competitive accuracy on the GLUE test +set. The SqueezeBERT code will be released.* + +Tips: + +- SqueezeBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right + rather than the left. +- SqueezeBERT is similar to BERT and therefore relies on the masked language modeling (MLM) objective. It is therefore + efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation. Models trained + with a causal language modeling (CLM) objective are better in that regard. +- For best results when finetuning on sequence classification tasks, it is recommended to start with the + *squeezebert/squeezebert-mnli-headless* checkpoint. + +This model was contributed by [forresti](https://huggingface.co/forresti). 
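+
+As a minimal sketch of the tip above (the checkpoint name comes from the tip; the example sentence and `num_labels=2` are placeholders for a hypothetical binary task), loading the headless checkpoint for sequence classification finetuning might look like this:
+
+```python
+>>> from transformers import AutoTokenizer, SqueezeBertForSequenceClassification
+
+>>> tokenizer = AutoTokenizer.from_pretrained("squeezebert/squeezebert-mnli-headless")
+>>> # the "headless" checkpoint ships without a task head, so a new classification head is randomly initialized
+>>> model = SqueezeBertForSequenceClassification.from_pretrained(
+...     "squeezebert/squeezebert-mnli-headless", num_labels=2
+... )
+
+>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+>>> logits = model(**inputs).logits
+```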
+ +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Token classification task guide](../tasks/token_classification) +- [Question answering task guide](../tasks/question_answering) +- [Masked language modeling task guide](../tasks/masked_language_modeling) +- [Multiple choice task guide](../tasks/multiple_choice) + +## SqueezeBertConfig + +[[autodoc]] SqueezeBertConfig + +## SqueezeBertTokenizer + +[[autodoc]] SqueezeBertTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## SqueezeBertTokenizerFast + +[[autodoc]] SqueezeBertTokenizerFast + +## SqueezeBertModel + +[[autodoc]] SqueezeBertModel + +## SqueezeBertForMaskedLM + +[[autodoc]] SqueezeBertForMaskedLM + +## SqueezeBertForSequenceClassification + +[[autodoc]] SqueezeBertForSequenceClassification + +## SqueezeBertForMultipleChoice + +[[autodoc]] SqueezeBertForMultipleChoice + +## SqueezeBertForTokenClassification + +[[autodoc]] SqueezeBertForTokenClassification + +## SqueezeBertForQuestionAnswering + +[[autodoc]] SqueezeBertForQuestionAnswering diff --git a/docs/source/en/model_doc/squeezebert.mdx b/docs/source/en/model_doc/squeezebert.mdx deleted file mode 100644 index c6219582c838..000000000000 --- a/docs/source/en/model_doc/squeezebert.mdx +++ /dev/null @@ -1,88 +0,0 @@ - - -# SqueezeBERT - -## Overview - -The SqueezeBERT model was proposed in [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, Kurt W. Keutzer. It's a -bidirectional transformer similar to the BERT model. The key difference between the BERT architecture and the -SqueezeBERT architecture is that SqueezeBERT uses [grouped convolutions](https://blog.yani.io/filter-group-tutorial) -instead of fully-connected layers for the Q, K, V and FFN layers. - -The abstract from the paper is the following: - -*Humans read and write hundreds of billions of messages every day. Further, due to the availability of large datasets, -large computing systems, and better neural network models, natural language processing (NLP) technology has made -significant strides in understanding, proofreading, and organizing these messages. Thus, there is a significant -opportunity to deploy NLP in myriad applications to help web users, social networks, and businesses. In particular, we -consider smartphones and other mobile devices as crucial platforms for deploying NLP models at scale. However, today's -highly-accurate NLP neural network models such as BERT and RoBERTa are extremely computationally expensive, with -BERT-base taking 1.7 seconds to classify a text snippet on a Pixel 3 smartphone. In this work, we observe that methods -such as grouped convolutions have yielded significant speedups for computer vision networks, but many of these -techniques have not been adopted by NLP neural network designers. We demonstrate how to replace several operations in -self-attention layers with grouped convolutions, and we use this technique in a novel network architecture called -SqueezeBERT, which runs 4.3x faster than BERT-base on the Pixel 3 while achieving competitive accuracy on the GLUE test -set. The SqueezeBERT code will be released.* - -Tips: - -- SqueezeBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right - rather than the left. 
-- SqueezeBERT is similar to BERT and therefore relies on the masked language modeling (MLM) objective. It is therefore - efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation. Models trained - with a causal language modeling (CLM) objective are better in that regard. -- For best results when finetuning on sequence classification tasks, it is recommended to start with the - *squeezebert/squeezebert-mnli-headless* checkpoint. - -This model was contributed by [forresti](https://huggingface.co/forresti). - - -## SqueezeBertConfig - -[[autodoc]] SqueezeBertConfig - -## SqueezeBertTokenizer - -[[autodoc]] SqueezeBertTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## SqueezeBertTokenizerFast - -[[autodoc]] SqueezeBertTokenizerFast - -## SqueezeBertModel - -[[autodoc]] SqueezeBertModel - -## SqueezeBertForMaskedLM - -[[autodoc]] SqueezeBertForMaskedLM - -## SqueezeBertForSequenceClassification - -[[autodoc]] SqueezeBertForSequenceClassification - -## SqueezeBertForMultipleChoice - -[[autodoc]] SqueezeBertForMultipleChoice - -## SqueezeBertForTokenClassification - -[[autodoc]] SqueezeBertForTokenClassification - -## SqueezeBertForQuestionAnswering - -[[autodoc]] SqueezeBertForQuestionAnswering diff --git a/docs/source/en/model_doc/swiftformer.md b/docs/source/en/model_doc/swiftformer.md new file mode 100644 index 000000000000..67c9597d2123 --- /dev/null +++ b/docs/source/en/model_doc/swiftformer.md @@ -0,0 +1,49 @@ + + +# SwiftFormer + +## Overview + +The SwiftFormer model was proposed in [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. + +The SwiftFormer paper introduces a novel efficient additive attention mechanism that effectively replaces the quadratic matrix multiplication operations in the self-attention computation with linear element-wise multiplications. A series of models called 'SwiftFormer' is built based on this, which achieves state-of-the-art performance in terms of both accuracy and mobile inference speed. Even their small variant achieves 78.5% top-1 ImageNet1K accuracy with only 0.8 ms latency on iPhone 14, which is more accurate and 2× faster compared to MobileViT-v2. + +The abstract from the paper is the following: + +*Self-attention has become a defacto choice for capturing global context in various vision applications. However, its quadratic computational complexity with respect to image resolution limits its use in real-time applications, especially for deployment on resource-constrained mobile devices. Although hybrid approaches have been proposed to combine the advantages of convolutions and self-attention for a better speed-accuracy trade-off, the expensive matrix multiplication operations in self-attention remain a bottleneck. In this work, we introduce a novel efficient additive attention mechanism that effectively replaces the quadratic matrix multiplication operations with linear element-wise multiplications. Our design shows that the key-value interaction can be replaced with a linear layer without sacrificing any accuracy. Unlike previous state-of-the-art methods, our efficient formulation of self-attention enables its usage at all stages of the network. 
Using our proposed efficient additive attention, we build a series of models called "SwiftFormer" which achieves state-of-the-art performance in terms of both accuracy and mobile inference speed. Our small variant achieves 78.5% top-1 ImageNet-1K accuracy with only 0.8 ms latency on iPhone 14, which is more accurate and 2x faster compared to MobileViT-v2.* + +Tips: + - One can use the [`ViTImageProcessor`] API to prepare images for the model. + + +This model was contributed by [shehan97](https://huggingface.co/shehan97). +The original code can be found [here](https://github.com/Amshaker/SwiftFormer). + + +## SwiftFormerConfig + +[[autodoc]] SwiftFormerConfig + +## SwiftFormerModel + +[[autodoc]] SwiftFormerModel + - forward + +## SwiftFormerForImageClassification + +[[autodoc]] SwiftFormerForImageClassification + - forward diff --git a/docs/source/en/model_doc/swin.md b/docs/source/en/model_doc/swin.md new file mode 100644 index 000000000000..37bb86db951a --- /dev/null +++ b/docs/source/en/model_doc/swin.md @@ -0,0 +1,100 @@ + + +# Swin Transformer + +## Overview + +The Swin Transformer was proposed in [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) +by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. + +The abstract from the paper is the following: + +*This paper presents a new vision Transformer, called Swin Transformer, that capably serves as a general-purpose backbone +for computer vision. Challenges in adapting Transformer from language to vision arise from differences between the two domains, +such as large variations in the scale of visual entities and the high resolution of pixels in images compared to words in text. +To address these differences, we propose a hierarchical Transformer whose representation is computed with \bold{S}hifted +\bold{win}dows. The shifted windowing scheme brings greater efficiency by limiting self-attention computation to non-overlapping +local windows while also allowing for cross-window connection. This hierarchical architecture has the flexibility to model at +various scales and has linear computational complexity with respect to image size. These qualities of Swin Transformer make it +compatible with a broad range of vision tasks, including image classification (87.3 top-1 accuracy on ImageNet-1K) and dense +prediction tasks such as object detection (58.7 box AP and 51.1 mask AP on COCO test-dev) and semantic segmentation +(53.5 mIoU on ADE20K val). Its performance surpasses the previous state-of-the-art by a large margin of +2.7 box AP and ++2.6 mask AP on COCO, and +3.2 mIoU on ADE20K, demonstrating the potential of Transformer-based models as vision backbones. +The hierarchical design and the shifted window approach also prove beneficial for all-MLP architectures.* + +Tips: +- One can use the [`AutoImageProcessor`] API to prepare images for the model. +- Swin pads the inputs supporting any input height and width (if divisible by `32`). +- Swin can be used as a *backbone*. When `output_hidden_states = True`, it will output both `hidden_states` and `reshaped_hidden_states`. The `reshaped_hidden_states` have a shape of `(batch, num_channels, height, width)` rather than `(batch_size, sequence_length, num_channels)`. + + + + Swin Transformer architecture. Taken from the original paper. + +This model was contributed by [novice03](https://huggingface.co/novice03). 
The Tensorflow version of this model was contributed by [amyeroberts](https://huggingface.co/amyeroberts). The original code can be found [here](https://github.com/microsoft/Swin-Transformer). + + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Swin Transformer. + + + +- [`SwinForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). +- See also: [Image classification task guide](../tasks/image_classification) + +Besides that: + +- [`SwinForMaskedImageModeling`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining). + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## SwinConfig + +[[autodoc]] SwinConfig + + +## SwinModel + +[[autodoc]] SwinModel + - forward + +## SwinForMaskedImageModeling + +[[autodoc]] SwinForMaskedImageModeling + - forward + +## SwinForImageClassification + +[[autodoc]] transformers.SwinForImageClassification + - forward + +## TFSwinModel + +[[autodoc]] TFSwinModel + - call + +## TFSwinForMaskedImageModeling + +[[autodoc]] TFSwinForMaskedImageModeling + - call + +## TFSwinForImageClassification + +[[autodoc]] transformers.TFSwinForImageClassification + - call diff --git a/docs/source/en/model_doc/swin.mdx b/docs/source/en/model_doc/swin.mdx deleted file mode 100644 index 503a141084a0..000000000000 --- a/docs/source/en/model_doc/swin.mdx +++ /dev/null @@ -1,81 +0,0 @@ - - -# Swin Transformer - -## Overview - -The Swin Transformer was proposed in [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) -by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. - -The abstract from the paper is the following: - -*This paper presents a new vision Transformer, called Swin Transformer, that capably serves as a general-purpose backbone -for computer vision. Challenges in adapting Transformer from language to vision arise from differences between the two domains, -such as large variations in the scale of visual entities and the high resolution of pixels in images compared to words in text. -To address these differences, we propose a hierarchical Transformer whose representation is computed with \bold{S}hifted -\bold{win}dows. The shifted windowing scheme brings greater efficiency by limiting self-attention computation to non-overlapping -local windows while also allowing for cross-window connection. This hierarchical architecture has the flexibility to model at -various scales and has linear computational complexity with respect to image size. These qualities of Swin Transformer make it -compatible with a broad range of vision tasks, including image classification (87.3 top-1 accuracy on ImageNet-1K) and dense -prediction tasks such as object detection (58.7 box AP and 51.1 mask AP on COCO test-dev) and semantic segmentation -(53.5 mIoU on ADE20K val). 
Its performance surpasses the previous state-of-the-art by a large margin of +2.7 box AP and -+2.6 mask AP on COCO, and +3.2 mIoU on ADE20K, demonstrating the potential of Transformer-based models as vision backbones. -The hierarchical design and the shifted window approach also prove beneficial for all-MLP architectures.* - -Tips: -- One can use the [`AutoImageProcessor`] API to prepare images for the model. -- Swin pads the inputs supporting any input height and width (if divisible by `32`). -- Swin can be used as a *backbone*. When `output_hidden_states = True`, it will output both `hidden_states` and `reshaped_hidden_states`. The `reshaped_hidden_states` have a shape of `(batch, num_channels, height, width)` rather than `(batch_size, sequence_length, num_channels)`. - - - - Swin Transformer architecture. Taken from the original paper. - -This model was contributed by [novice03](https://huggingface.co/novice03). The Tensorflow version of this model was contributed by [amyeroberts](https://huggingface.co/amyeroberts). The original code can be found [here](https://github.com/microsoft/Swin-Transformer). - - -## SwinConfig - -[[autodoc]] SwinConfig - - -## SwinModel - -[[autodoc]] SwinModel - - forward - -## SwinForMaskedImageModeling - -[[autodoc]] SwinForMaskedImageModeling - - forward - -## SwinForImageClassification - -[[autodoc]] transformers.SwinForImageClassification - - forward - -## TFSwinModel - -[[autodoc]] TFSwinModel - - call - -## TFSwinForMaskedImageModeling - -[[autodoc]] TFSwinForMaskedImageModeling - - call - -## TFSwinForImageClassification - -[[autodoc]] transformers.TFSwinForImageClassification - - call diff --git a/docs/source/en/model_doc/swin2sr.md b/docs/source/en/model_doc/swin2sr.md new file mode 100644 index 000000000000..dfee144e50c4 --- /dev/null +++ b/docs/source/en/model_doc/swin2sr.md @@ -0,0 +1,61 @@ + + +# Swin2SR + +## Overview + +The Swin2SR model was proposed in [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. +Swin2R improves the [SwinIR](https://github.com/JingyunLiang/SwinIR/) model by incorporating [Swin Transformer v2](swinv2) layers which mitigates issues such as training instability, resolution gaps between pre-training +and fine-tuning, and hunger on data. + +The abstract from the paper is the following: + +*Compression plays an important role on the efficient transmission and storage of images and videos through band-limited systems such as streaming services, virtual reality or videogames. However, compression unavoidably leads to artifacts and the loss of the original information, which may severely degrade the visual quality. For these reasons, quality enhancement of compressed images has become a popular research topic. While most state-of-the-art image restoration methods are based on convolutional neural networks, other transformers-based methods such as SwinIR, show impressive performance on these tasks. +In this paper, we explore the novel Swin Transformer V2, to improve SwinIR for image super-resolution, and in particular, the compressed input scenario. Using this method we can tackle the major issues in training transformer vision models, such as training instability, resolution gaps between pre-training and fine-tuning, and hunger on data. 
We conduct experiments on three representative tasks: JPEG compression artifacts removal, image super-resolution (classical and lightweight), and compressed image super-resolution. Experimental results demonstrate that our method, Swin2SR, can improve the training convergence and performance of SwinIR, and is a top-5 solution at the "AIM 2022 Challenge on Super-Resolution of Compressed Image and Video".* + + + + Swin2SR architecture. Taken from the original paper. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). +The original code can be found [here](https://github.com/mv-lab/swin2sr). + +## Resources + +Demo notebooks for Swin2SR can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Swin2SR). + +A demo Space for image super-resolution with SwinSR can be found [here](https://huggingface.co/spaces/jjourney1125/swin2sr). + +## Swin2SRImageProcessor + +[[autodoc]] Swin2SRImageProcessor + - preprocess + +## Swin2SRConfig + +[[autodoc]] Swin2SRConfig + +## Swin2SRModel + +[[autodoc]] Swin2SRModel + - forward + +## Swin2SRForImageSuperResolution + +[[autodoc]] Swin2SRForImageSuperResolution + - forward diff --git a/docs/source/en/model_doc/swin2sr.mdx b/docs/source/en/model_doc/swin2sr.mdx deleted file mode 100644 index edb073d1ee38..000000000000 --- a/docs/source/en/model_doc/swin2sr.mdx +++ /dev/null @@ -1,57 +0,0 @@ - - -# Swin2SR - -## Overview - -The Swin2SR model was proposed in [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. -Swin2R improves the [SwinIR](https://github.com/JingyunLiang/SwinIR/) model by incorporating [Swin Transformer v2](swinv2) layers which mitigates issues such as training instability, resolution gaps between pre-training -and fine-tuning, and hunger on data. - -The abstract from the paper is the following: - -*Compression plays an important role on the efficient transmission and storage of images and videos through band-limited systems such as streaming services, virtual reality or videogames. However, compression unavoidably leads to artifacts and the loss of the original information, which may severely degrade the visual quality. For these reasons, quality enhancement of compressed images has become a popular research topic. While most state-of-the-art image restoration methods are based on convolutional neural networks, other transformers-based methods such as SwinIR, show impressive performance on these tasks. -In this paper, we explore the novel Swin Transformer V2, to improve SwinIR for image super-resolution, and in particular, the compressed input scenario. Using this method we can tackle the major issues in training transformer vision models, such as training instability, resolution gaps between pre-training and fine-tuning, and hunger on data. We conduct experiments on three representative tasks: JPEG compression artifacts removal, image super-resolution (classical and lightweight), and compressed image super-resolution. Experimental results demonstrate that our method, Swin2SR, can improve the training convergence and performance of SwinIR, and is a top-5 solution at the "AIM 2022 Challenge on Super-Resolution of Compressed Image and Video".* - - - - Swin2SR architecture. Taken from the original paper. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). -The original code can be found [here](https://github.com/mv-lab/swin2sr). 
- -## Resources - -Demo notebooks for Swin2SR can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Swin2SR). - -A demo Space for image super-resolution with SwinSR can be found [here](https://huggingface.co/spaces/jjourney1125/swin2sr). - -## Swin2SRImageProcessor - -[[autodoc]] Swin2SRImageProcessor - - preprocess - -## Swin2SRConfig - -[[autodoc]] Swin2SRConfig - -## Swin2SRModel - -[[autodoc]] Swin2SRModel - - forward - -## Swin2SRForImageSuperResolution - -[[autodoc]] Swin2SRForImageSuperResolution - - forward diff --git a/docs/source/en/model_doc/swinv2.md b/docs/source/en/model_doc/swinv2.md new file mode 100644 index 000000000000..e08389527ece --- /dev/null +++ b/docs/source/en/model_doc/swinv2.md @@ -0,0 +1,65 @@ + + +# Swin Transformer V2 + +## Overview + +The Swin Transformer V2 model was proposed in [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. + +The abstract from the paper is the following: + +*Large-scale NLP models have been shown to significantly improve the performance on language tasks with no signs of saturation. They also demonstrate amazing few-shot capabilities like that of human beings. This paper aims to explore large-scale models in computer vision. We tackle three major issues in training and application of large vision models, including training instability, resolution gaps between pre-training and fine-tuning, and hunger on labelled data. Three main techniques are proposed: 1) a residual-post-norm method combined with cosine attention to improve training stability; 2) A log-spaced continuous position bias method to effectively transfer models pre-trained using low-resolution images to downstream tasks with high-resolution inputs; 3) A self-supervised pre-training method, SimMIM, to reduce the needs of vast labeled images. Through these techniques, this paper successfully trained a 3 billion-parameter Swin Transformer V2 model, which is the largest dense vision model to date, and makes it capable of training with images of up to 1,536×1,536 resolution. It set new performance records on 4 representative vision tasks, including ImageNet-V2 image classification, COCO object detection, ADE20K semantic segmentation, and Kinetics-400 video action classification. Also note our training is much more efficient than that in Google's billion-level visual models, which consumes 40 times less labelled data and 40 times less training time.* + +Tips: +- One can use the [`AutoImageProcessor`] API to prepare images for the model. + +This model was contributed by [nandwalritik](https://huggingface.co/nandwalritik). +The original code can be found [here](https://github.com/microsoft/Swin-Transformer). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Swin Transformer v2. + + + +- [`Swinv2ForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). 
+- See also: [Image classification task guide](../tasks/image_classification) + +Besides that: + +- [`Swinv2ForMaskedImageModeling`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining). + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## Swinv2Config + +[[autodoc]] Swinv2Config + +## Swinv2Model + +[[autodoc]] Swinv2Model + - forward + +## Swinv2ForMaskedImageModeling + +[[autodoc]] Swinv2ForMaskedImageModeling + - forward + +## Swinv2ForImageClassification + +[[autodoc]] transformers.Swinv2ForImageClassification + - forward diff --git a/docs/source/en/model_doc/swinv2.mdx b/docs/source/en/model_doc/swinv2.mdx deleted file mode 100644 index 576f1a142a63..000000000000 --- a/docs/source/en/model_doc/swinv2.mdx +++ /dev/null @@ -1,47 +0,0 @@ - - -# Swin Transformer V2 - -## Overview - -The Swin Transformer V2 model was proposed in [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. - -The abstract from the paper is the following: - -*Large-scale NLP models have been shown to significantly improve the performance on language tasks with no signs of saturation. They also demonstrate amazing few-shot capabilities like that of human beings. This paper aims to explore large-scale models in computer vision. We tackle three major issues in training and application of large vision models, including training instability, resolution gaps between pre-training and fine-tuning, and hunger on labelled data. Three main techniques are proposed: 1) a residual-post-norm method combined with cosine attention to improve training stability; 2) A log-spaced continuous position bias method to effectively transfer models pre-trained using low-resolution images to downstream tasks with high-resolution inputs; 3) A self-supervised pre-training method, SimMIM, to reduce the needs of vast labeled images. Through these techniques, this paper successfully trained a 3 billion-parameter Swin Transformer V2 model, which is the largest dense vision model to date, and makes it capable of training with images of up to 1,536×1,536 resolution. It set new performance records on 4 representative vision tasks, including ImageNet-V2 image classification, COCO object detection, ADE20K semantic segmentation, and Kinetics-400 video action classification. Also note our training is much more efficient than that in Google's billion-level visual models, which consumes 40 times less labelled data and 40 times less training time.* - -Tips: -- One can use the [`AutoImageProcessor`] API to prepare images for the model. - -This model was contributed by [nandwalritik](https://huggingface.co/nandwalritik). -The original code can be found [here](https://github.com/microsoft/Swin-Transformer). 
- - -## Swinv2Config - -[[autodoc]] Swinv2Config - -## Swinv2Model - -[[autodoc]] Swinv2Model - - forward - -## Swinv2ForMaskedImageModeling - -[[autodoc]] Swinv2ForMaskedImageModeling - - forward - -## Swinv2ForImageClassification - -[[autodoc]] transformers.Swinv2ForImageClassification - - forward diff --git a/docs/source/en/model_doc/switch_transformers.md b/docs/source/en/model_doc/switch_transformers.md new file mode 100644 index 000000000000..8f6a231b7ef7 --- /dev/null +++ b/docs/source/en/model_doc/switch_transformers.md @@ -0,0 +1,72 @@ + + +# SwitchTransformers + +## Overview + +The SwitchTransformers model was proposed in [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. + +The Switch Transformer model uses a sparse T5 encoder-decoder architecture, where the MLP are replaced by a Mixture of Experts (MoE). A routing mechanism (top 1 in this case) associates each token to one of the expert, where each expert is a dense MLP. While switch transformers have a lot more weights than their equivalent dense models, the sparsity allows better scaling and better finetuning performance at scale. +During a forward pass, only a fraction of the weights are used. The routing mechanism allows the model to select relevant weights on the fly which increases the model capacity without increasing the number of operations. + + +The abstract from the paper is the following: + +*In deep learning, models typically reuse the same parameters for all inputs. Mixture of Experts (MoE) defies this and instead selects different parameters for each incoming example. The result is a sparsely-activated model -- with outrageous numbers of parameters -- but a constant computational cost. However, despite several notable successes of MoE, widespread adoption has been hindered by complexity, communication costs and training instability -- we address these with the Switch Transformer. We simplify the MoE routing algorithm and design intuitive improved models with reduced communication and computational costs. Our proposed training techniques help wrangle the instabilities and we show large sparse models may be trained, for the first time, with lower precision (bfloat16) formats. We design models based off T5-Base and T5-Large to obtain up to 7x increases in pre-training speed with the same computational resources. These improvements extend into multilingual settings where we measure gains over the mT5-Base version across all 101 languages. Finally, we advance the current scale of language models by pre-training up to trillion parameter models on the "Colossal Clean Crawled Corpus" and achieve a 4x speedup over the T5-XXL model.* + +Tips: + +- SwitchTransformers uses the [`T5Tokenizer`], which can be loaded directly from each model's repository. +- The released weights are pretrained on English [Masked Language Modeling](https://moon-ci-docs.huggingface.co/docs/transformers/pr_19323/en/glossary#general-terms) task, and should be finetuned. + +This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) and [Arthur Zucker](https://huggingface.co/ArtZucker) . +The original code can be found [here](https://github.com/google/flaxformer/tree/main/flaxformer/architectures/moe). 
+ +## Resources + +- [Translation task guide](../tasks/translation) +- [Summarization task guide](../tasks/summarization) + +## SwitchTransformersConfig + +[[autodoc]] SwitchTransformersConfig + +## SwitchTransformersTop1Router + +[[autodoc]] SwitchTransformersTop1Router + - _compute_router_probabilities + - forward + +## SwitchTransformersSparseMLP + +[[autodoc]] SwitchTransformersSparseMLP + - forward + +## SwitchTransformersModel + +[[autodoc]] SwitchTransformersModel + - forward + +## SwitchTransformersForConditionalGeneration + +[[autodoc]] SwitchTransformersForConditionalGeneration + - forward + +## SwitchTransformersEncoderModel + +[[autodoc]] SwitchTransformersEncoderModel + - forward diff --git a/docs/source/en/model_doc/switch_transformers.mdx b/docs/source/en/model_doc/switch_transformers.mdx deleted file mode 100644 index 348c831a0e98..000000000000 --- a/docs/source/en/model_doc/switch_transformers.mdx +++ /dev/null @@ -1,64 +0,0 @@ - - -# SwitchTransformers - -## Overview - -The SwitchTransformers model was proposed in [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. - -The Switch Transformer model uses a sparse T5 encoder-decoder architecure, where the MLP are replaced by a Mixture of Experts (MoE). A routing mechanism (top 1 in this case) associates each token to one of the expert, where each expert is a dense MLP. While switch transformers have a lot more weights than their equivalent dense models, the sparsity allows better scaling and better finetuning performance at scale. -During a forward pass, only a fraction of the weights are used. The routing mecanism allows the model to select relevant weights on the fly which increases the model capacity without increasing the number of operations. - - -The abstract from the paper is the following: - -*In deep learning, models typically reuse the same parameters for all inputs. Mixture of Experts (MoE) defies this and instead selects different parameters for each incoming example. The result is a sparsely-activated model -- with outrageous numbers of parameters -- but a constant computational cost. However, despite several notable successes of MoE, widespread adoption has been hindered by complexity, communication costs and training instability -- we address these with the Switch Transformer. We simplify the MoE routing algorithm and design intuitive improved models with reduced communication and computational costs. Our proposed training techniques help wrangle the instabilities and we show large sparse models may be trained, for the first time, with lower precision (bfloat16) formats. We design models based off T5-Base and T5-Large to obtain up to 7x increases in pre-training speed with the same computational resources. These improvements extend into multilingual settings where we measure gains over the mT5-Base version across all 101 languages. Finally, we advance the current scale of language models by pre-training up to trillion parameter models on the "Colossal Clean Crawled Corpus" and achieve a 4x speedup over the T5-XXL model.* - -Tips: - -- SwitchTransformers uses the [`T5Tokenizer`], which can be loaded directly from each model's repository. -- The released weights are pretrained on English [Masked Language Modeling](https://moon-ci-docs.huggingface.co/docs/transformers/pr_19323/en/glossary#general-terms) task, and should be finetuned. 
- -This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) and [Arthur Zucker](https://huggingface.co/ArtZucker) . -The original code can be found [here](https://github.com/google/flaxformer/tree/main/flaxformer/architectures/moe). - - -## SwitchTransformersConfig - -[[autodoc]] SwitchTransformersConfig - -## SwitchTransformersTop1Router - -[[autodoc]] SwitchTransformersTop1Router - - _compute_router_probabilities - - forward - -## SwitchTransformersSparseMLP - -[[autodoc]] SwitchTransformersSparseMLP - - forward - -## SwitchTransformersModel - -[[autodoc]] SwitchTransformersModel - - forward - -## SwitchTransformersForConditionalGeneration - -[[autodoc]] SwitchTransformersForConditionalGeneration - - forward - -## SwitchTransformersEncoderModel - -[[autodoc]] SwitchTransformersEncoderModel - - forward diff --git a/docs/source/en/model_doc/t5.md b/docs/source/en/model_doc/t5.md new file mode 100644 index 000000000000..2e833e8e1a67 --- /dev/null +++ b/docs/source/en/model_doc/t5.md @@ -0,0 +1,446 @@ + + +# T5 + +
+
+Models | Spaces | Paper page
+
+ +## Overview + +The T5 model was presented in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/pdf/1910.10683.pdf) by [Colin Raffel](https://huggingface.co/craffel), Noam Shazeer, [Adam Roberts](https://huggingface.co/adarob), Katherine Lee, Sharan Narang, +Michael Matena, Yanqi Zhou, Wei Li, [Peter J. Liu](https://huggingface.co/peterjliu). + +The abstract from the paper is the following: + +*Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a downstream +task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning +has given rise to a diversity of approaches, methodology, and practice. In this paper, we explore the landscape of +transfer learning techniques for NLP by introducing a unified framework that converts every language problem into a +text-to-text format. Our systematic study compares pretraining objectives, architectures, unlabeled datasets, transfer +approaches, and other factors on dozens of language understanding tasks. By combining the insights from our exploration +with scale and our new "Colossal Clean Crawled Corpus", we achieve state-of-the-art results on many benchmarks covering +summarization, question answering, text classification, and more. To facilitate future work on transfer learning for +NLP, we release our dataset, pre-trained models, and code.* + +Tips: + +- T5 is an encoder-decoder model pre-trained on a multi-task mixture of unsupervised and supervised tasks and for which +each task is converted into a text-to-text format. T5 works well on a variety of tasks out-of-the-box by prepending a +different prefix to the input corresponding to each task, e.g., for translation: *translate English to German: ...*, +for summarization: *summarize: ...*. +- The pretraining includes both supervised and self-supervised training. Supervised training is conducted on downstream tasks provided by the GLUE and SuperGLUE benchmarks (converting them into text-to-text tasks as explained above). +- Self-supervised training uses corrupted tokens, by randomly removing 15% of the tokens and replacing them with individual sentinel tokens (if several consecutive tokens are marked for removal, the whole group is replaced with a single sentinel token). The input of the encoder is the corrupted sentence, the input of the decoder is the original sentence and the target is then the dropped out tokens delimited by their sentinel tokens. + +- T5 uses relative scalar embeddings. Encoder input padding can be done on the left and on the right. + +- See the [training](#training), [inference](#inference) and [scripts](#scripts) sections below for all details regarding usage. + +T5 comes in different sizes: + +- [t5-small](https://huggingface.co/t5-small) + +- [t5-base](https://huggingface.co/t5-base) + +- [t5-large](https://huggingface.co/t5-large) + +- [t5-3b](https://huggingface.co/t5-3b) + +- [t5-11b](https://huggingface.co/t5-11b). + +Based on the original T5 model, Google has released some follow-up works: + +- **T5v1.1**: T5v1.1 is an improved version of T5 with some architectural tweaks, and is pre-trained on C4 only without + mixing in the supervised tasks. Refer to the documentation of T5v1.1 which can be found [here](t5v1.1). + +- **mT5**: mT5 is a multilingual T5 model. It is pre-trained on the mC4 corpus, which includes 101 languages. Refer to + the documentation of mT5 which can be found [here](mt5). 
+
+- **byT5**: byT5 is a T5 model pre-trained on byte sequences rather than SentencePiece subword token sequences. Refer
+  to the documentation of byT5 which can be found [here](byt5).
+
+- **UL2**: UL2 is a T5-like model pretrained on various denoising objectives.
+
+- **Flan-T5**: Flan is a pretraining method that is based on prompting. The Flan-T5 models are T5 models trained on the Flan collection of
+  datasets which include: `taskmaster2`, `djaym7/wiki_dialog`, `deepmind/code_contests`, `lambada`, `gsm8k`, `aqua_rat`, `esnli`, `quasc` and `qed`.
+
+- **FLan-UL2**: the UL2 model finetuned using the "Flan" prompt tuning and dataset collection.
+
+- **UMT5**: UmT5 is a multilingual T5 model trained on an improved and refreshed mC4 multilingual corpus, 29 trillion characters across 107 languages, using a new sampling method, UniMax. Refer to
+  the documentation of umT5 which can be found [here](umt5).
+
+All checkpoints can be found on the [hub](https://huggingface.co/models?search=t5).
+
+This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/google-research/text-to-text-transfer-transformer).
+
+
+
+## Training
+
+T5 is an encoder-decoder model and converts all NLP problems into a text-to-text format. It is trained using teacher
+forcing. This means that for training, we always need an input sequence and a corresponding target sequence. The input
+sequence is fed to the model using `input_ids`. The target sequence is shifted to the right, i.e., prepended by a
+start-sequence token and fed to the decoder using the `decoder_input_ids`. In teacher-forcing style, the target
+sequence is then appended by the EOS token and corresponds to the `labels`. The PAD token is hereby used as the
+start-sequence token. T5 can be trained / fine-tuned both in a supervised and unsupervised fashion.
+
+One can use [`T5ForConditionalGeneration`] (or the Tensorflow/Flax variant), which includes the
+language modeling head on top of the decoder.
+
+- Unsupervised denoising training
+
+In this setup, spans of the input sequence are masked by so-called sentinel tokens (*a.k.a* unique mask tokens) and
+the output sequence is formed as a concatenation of the same sentinel tokens and the *real* masked tokens. Each
+sentinel token represents a unique mask token for this sentence and should start with `<extra_id_0>`,
+`<extra_id_1>`, ... up to `<extra_id_99>`. As a default, 100 sentinel tokens are available in
+[`T5Tokenizer`].
+
+For instance, the sentence "The cute dog walks in the park" with the masks put on "cute dog" and "the" should be
+processed as follows:
+
+```python
+>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+>>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
+>>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
+
+>>> # the forward function automatically creates the correct decoder_input_ids
+>>> loss = model(input_ids=input_ids, labels=labels).loss
+>>> loss.item()
+3.7837
+```
+
+If you're interested in pre-training T5 on a new corpus, check out the [run_t5_mlm_flax.py](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling) script in the Examples
+directory.
+
+- Supervised training
+
+In this setup, the input sequence and output sequence are a standard sequence-to-sequence input-output mapping.
+Suppose that we want to fine-tune the model for translation for example, and we have a training example: the input +sequence "The house is wonderful." and output sequence "Das Haus ist wunderbar.", then they should be prepared for +the model as follows: + +```python +>>> from transformers import T5Tokenizer, T5ForConditionalGeneration + +>>> tokenizer = T5Tokenizer.from_pretrained("t5-small") +>>> model = T5ForConditionalGeneration.from_pretrained("t5-small") + +>>> input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids +>>> labels = tokenizer("Das Haus ist wunderbar.", return_tensors="pt").input_ids + +>>> # the forward function automatically creates the correct decoder_input_ids +>>> loss = model(input_ids=input_ids, labels=labels).loss +>>> loss.item() +0.2542 +``` + +As you can see, only 2 inputs are required for the model in order to compute a loss: `input_ids` (which are the +`input_ids` of the encoded input sequence) and `labels` (which are the `input_ids` of the encoded +target sequence). The model will automatically create the `decoder_input_ids` based on the `labels`, by +shifting them one position to the right and prepending the `config.decoder_start_token_id`, which for T5 is +equal to 0 (i.e. the id of the pad token). Also note the task prefix: we prepend the input sequence with 'translate +English to German: ' before encoding it. This will help in improving the performance, as this task prefix was used +during T5's pre-training. + +However, the example above only shows a single training example. In practice, one trains deep learning models in +batches. This entails that we must pad/truncate examples to the same length. For encoder-decoder models, one +typically defines a `max_source_length` and `max_target_length`, which determine the maximum length of the +input and output sequences respectively (otherwise they are truncated). These should be carefully set depending on +the task. + +In addition, we must make sure that padding token id's of the `labels` are not taken into account by the loss +function. In PyTorch and Tensorflow, this can be done by replacing them with -100, which is the `ignore_index` +of the `CrossEntropyLoss`. In Flax, one can use the `decoder_attention_mask` to ignore padded tokens from +the loss (see the [Flax summarization script](https://github.com/huggingface/transformers/tree/main/examples/flax/summarization) for details). We also pass +`attention_mask` as additional input to the model, which makes sure that padding tokens of the inputs are +ignored. The code example below illustrates all of this. + +```python +>>> from transformers import T5Tokenizer, T5ForConditionalGeneration +>>> import torch + +>>> tokenizer = T5Tokenizer.from_pretrained("t5-small") +>>> model = T5ForConditionalGeneration.from_pretrained("t5-small") + +>>> # the following 2 hyperparameters are task-specific +>>> max_source_length = 512 +>>> max_target_length = 128 + +>>> # Suppose we have the following 2 training examples: +>>> input_sequence_1 = "Welcome to NYC" +>>> output_sequence_1 = "Bienvenue à NYC" + +>>> input_sequence_2 = "HuggingFace is a company" +>>> output_sequence_2 = "HuggingFace est une entreprise" + +>>> # encode the inputs +>>> task_prefix = "translate English to French: " +>>> input_sequences = [input_sequence_1, input_sequence_2] + +>>> encoding = tokenizer( +... [task_prefix + sequence for sequence in input_sequences], +... padding="longest", +... max_length=max_source_length, +... truncation=True, +... 
return_tensors="pt",
+... )
+
+>>> input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
+
+>>> # encode the targets
+>>> target_encoding = tokenizer(
+...     [output_sequence_1, output_sequence_2],
+...     padding="longest",
+...     max_length=max_target_length,
+...     truncation=True,
+...     return_tensors="pt",
+... )
+>>> labels = target_encoding.input_ids
+
+>>> # replace padding token id's of the labels by -100 so it's ignored by the loss
+>>> labels[labels == tokenizer.pad_token_id] = -100
+
+>>> # forward pass
+>>> loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
+>>> loss.item()
+0.188
+```
+
+Additional training tips:
+
+- T5 models need a slightly higher learning rate than the default one set in the `Trainer` when using the AdamW
+optimizer. Typically, 1e-4 and 3e-4 work well for most problems (classification, summarization, translation, question
+answering, question generation). Note that T5 was pre-trained using the AdaFactor optimizer.
+
+According to [this forum post](https://discuss.huggingface.co/t/t5-finetuning-tips/684), task prefixes matter when
+(1) doing multi-task training (2) your task is similar or related to one of the supervised tasks used in T5's
+pre-training mixture (see Appendix D of the [paper](https://arxiv.org/pdf/1910.10683.pdf) for the task prefixes
+used).
+
+If training on TPU, it is recommended to pad all examples of the dataset to the same length or make use of
+*pad_to_multiple_of* to have a small number of predefined bucket sizes to fit all examples in. Dynamically padding
+batches to the longest example is not recommended on TPU, as it triggers a recompilation for every batch shape
+encountered during training and thus significantly slows down training; in other words, padding only up to the
+longest example in a batch leads to very slow training on TPU.
+
+
+
+## Inference
+
+At inference time, it is recommended to use [`~generation.GenerationMixin.generate`]. This
+method takes care of encoding the input and feeding the encoded hidden states via cross-attention layers to the decoder
+and auto-regressively generates the decoder output. Check out [this blog post](https://huggingface.co/blog/how-to-generate) to know all the details about generating text with Transformers.
+There's also [this blog post](https://huggingface.co/blog/encoder-decoder#encoder-decoder) which explains how
+generation works in general in encoder-decoder models.
+
+```python
+>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+>>> input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
+>>> outputs = model.generate(input_ids)
+>>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+Das Haus ist wunderbar.
+```
+
+Note that T5 uses the `pad_token_id` as the `decoder_start_token_id`, so when doing generation without using
+[`~generation.GenerationMixin.generate`], make sure you start it with the `pad_token_id`.
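+
+For illustration, here is a minimal sketch of a single greedy decoding step performed without
+[`~generation.GenerationMixin.generate`], starting the decoder from the pad token (the input sentence is simply reused from above):
+
+```python
+>>> import torch
+>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+>>> input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
+
+>>> # start the decoder with the pad token, which T5 uses as decoder_start_token_id
+>>> decoder_input_ids = torch.full((1, 1), model.config.decoder_start_token_id, dtype=torch.long)
+
+>>> # one greedy step: append the most likely next token to the decoder input
+>>> logits = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids).logits
+>>> next_token_id = logits[:, -1].argmax(dim=-1, keepdim=True)
+>>> decoder_input_ids = torch.cat([decoder_input_ids, next_token_id], dim=-1)
+```
+
+The example above only shows a single example.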
+The example above only shows a single example. You can also do batched inference, like so:
+
+```python
+>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+>>> task_prefix = "translate English to German: "
+>>> # use different length sentences to test batching
+>>> sentences = ["The house is wonderful.", "I like to work in NYC."]
+
+>>> inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)
+
+>>> output_sequences = model.generate(
+...     input_ids=inputs["input_ids"],
+...     attention_mask=inputs["attention_mask"],
+...     do_sample=False,  # disable sampling to test if batching affects output
+... )
+
+>>> print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))
+['Das Haus ist wunderbar.', 'Ich arbeite gerne in NYC.']
+```
+
+Because T5 has been trained with the span-mask denoising objective,
+it can be used to predict the sentinel (masked-out) tokens during inference.
+The predicted tokens will then be placed between the sentinel tokens.
+
+```python
+>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+>>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
+
+>>> sequence_ids = model.generate(input_ids)
+>>> sequences = tokenizer.batch_decode(sequence_ids)
+>>> sequences
+['<pad><extra_id_0> park offers<extra_id_1> the<extra_id_2> park.</s>']
+```
+
+
+
+
+## Performance
+
+If you'd like faster training and inference performance, install [apex](https://github.com/NVIDIA/apex#quick-start); the model will then automatically use `apex.normalization.FusedRMSNorm` instead of `T5LayerNorm`. The former uses an optimized fused kernel which is several times faster than the latter.
+
+
+## Resources
+
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with T5. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+
+
+
+- A notebook on how to [finetune T5 for classification and multiple choice](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb).
+- A notebook on how to [finetune T5 for sentiment span extraction](https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb). 🌎
+
+
+
+- A notebook on how to [finetune T5 for named entity recognition](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing). 🌎
+
+
+
+- A notebook on [Finetuning CodeT5 for generating docstrings from Ruby code](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/T5/Fine_tune_CodeT5_for_generating_docstrings_from_Ruby_code.ipynb).
+
+
+
+- A notebook to [Finetune T5-base-dutch to perform Dutch abstractive summarization on a TPU](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/T5/Fine_tuning_Dutch_T5_base_on_CNN_Daily_Mail_for_summarization_(on_TPU_using_HuggingFace_Accelerate).ipynb).
+- A notebook for how to [finetune T5 for summarization in PyTorch and track experiments with WandB](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb#scrollTo=OKRpFvYhBauC). 🌎 +- A blog post on [Distributed Training: Train BART/T5 for Summarization using 🤗 Transformers and Amazon SageMaker](https://huggingface.co/blog/sagemaker-distributed-training-seq2seq). +- [`T5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb). +- [`TFT5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb). +- [`FlaxT5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/summarization). +- [Summarization](https://huggingface.co/course/chapter7/5?fw=pt#summarization) chapter of the 🤗 Hugging Face course. +- [Summarization task guide](../tasks/summarization) + + + +- [`FlaxT5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#t5-like-span-masked-language-modeling) for training T5 with a span-masked language model objective. The script also shows how to train a T5 tokenizer. [`FlaxT5ForConditionalGeneration`] is also supported by this [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/masked_language_modeling_flax.ipynb). + + + +- [`T5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/translation) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb). +- [`TFT5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/translation) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb). +- [Translation task guide](../tasks/translation) + + + +- A notebook on how to [finetune T5 for question answering with TensorFlow 2](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb). 🌎 +- A notebook on how to [finetune T5 for question answering on a TPU](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil). + +🚀 **Deploy** +- A blog post on how to deploy [T5 11B for inference for less than $500](https://www.philschmid.de/deploy-t5-11b). 
+ +## T5Config + +[[autodoc]] T5Config + +## T5Tokenizer + +[[autodoc]] T5Tokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## T5TokenizerFast + +[[autodoc]] T5TokenizerFast + +## T5Model + +[[autodoc]] T5Model + - forward + +## T5ForConditionalGeneration + +[[autodoc]] T5ForConditionalGeneration + - forward + +## T5EncoderModel + +[[autodoc]] T5EncoderModel + - forward + +## T5ForSequenceClassification + +[[autodoc]] T5ForSequenceClassification + - forward + +## T5ForQuestionAnswering + +[[autodoc]] T5ForQuestionAnswering + - forward + +## TFT5Model + +[[autodoc]] TFT5Model + - call + +## TFT5ForConditionalGeneration + +[[autodoc]] TFT5ForConditionalGeneration + - call + +## TFT5EncoderModel + +[[autodoc]] TFT5EncoderModel + - call + +## FlaxT5Model + +[[autodoc]] FlaxT5Model + - __call__ + - encode + - decode + +## FlaxT5ForConditionalGeneration + +[[autodoc]] FlaxT5ForConditionalGeneration + - __call__ + - encode + - decode + +## FlaxT5EncoderModel + +[[autodoc]] FlaxT5EncoderModel + - __call__ diff --git a/docs/source/en/model_doc/t5.mdx b/docs/source/en/model_doc/t5.mdx deleted file mode 100644 index 995816061c76..000000000000 --- a/docs/source/en/model_doc/t5.mdx +++ /dev/null @@ -1,412 +0,0 @@ - - -# T5 - -## Overview - -The T5 model was presented in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/pdf/1910.10683.pdf) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, -Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. - -The abstract from the paper is the following: - -*Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a downstream -task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning -has given rise to a diversity of approaches, methodology, and practice. In this paper, we explore the landscape of -transfer learning techniques for NLP by introducing a unified framework that converts every language problem into a -text-to-text format. Our systematic study compares pretraining objectives, architectures, unlabeled datasets, transfer -approaches, and other factors on dozens of language understanding tasks. By combining the insights from our exploration -with scale and our new "Colossal Clean Crawled Corpus", we achieve state-of-the-art results on many benchmarks covering -summarization, question answering, text classification, and more. To facilitate future work on transfer learning for -NLP, we release our dataset, pre-trained models, and code.* - -Tips: - -- T5 is an encoder-decoder model pre-trained on a multi-task mixture of unsupervised and supervised tasks and for which -each task is converted into a text-to-text format. T5 works well on a variety of tasks out-of-the-box by prepending a -different prefix to the input corresponding to each task, e.g., for translation: *translate English to German: ...*, -for summarization: *summarize: ...*. - -- T5 uses relative scalar embeddings. Encoder input padding can be done on the left and on the right. - -- See the [training](#training), [inference](#inference) and [scripts](#scripts) sections below for all details regarding usage. 
- -T5 comes in different sizes: - -- [t5-small](https://huggingface.co/t5-small) - -- [t5-base](https://huggingface.co/t5-base) - -- [t5-large](https://huggingface.co/t5-large) - -- [t5-3b](https://huggingface.co/t5-3b) - -- [t5-11b](https://huggingface.co/t5-11b). - -Based on the original T5 model, Google has released some follow-up works: - -- **T5v1.1**: T5v1.1 is an improved version of T5 with some architectural tweaks, and is pre-trained on C4 only without - mixing in the supervised tasks. Refer to the documentation of T5v1.1 which can be found [here](t5v1.1). - -- **mT5**: mT5 is a multilingual T5 model. It is pre-trained on the mC4 corpus, which includes 101 languages. Refer to - the documentation of mT5 which can be found [here](mt5). - -- **byT5**: byT5 is a T5 model pre-trained on byte sequences rather than SentencePiece subword token sequences. Refer - to the documentation of byT5 which can be found [here](byt5). - -All checkpoints can be found on the [hub](https://huggingface.co/models?search=t5). - -This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/google-research/text-to-text-transfer-transformer). - - - -## Training - -T5 is an encoder-decoder model and converts all NLP problems into a text-to-text format. It is trained using teacher -forcing. This means that for training, we always need an input sequence and a corresponding target sequence. The input -sequence is fed to the model using `input_ids`. The target sequence is shifted to the right, i.e., prepended by a -start-sequence token and fed to the decoder using the `decoder_input_ids`. In teacher-forcing style, the target -sequence is then appended by the EOS token and corresponds to the `labels`. The PAD token is hereby used as the -start-sequence token. T5 can be trained / fine-tuned both in a supervised and unsupervised fashion. - -One can use [`T5ForConditionalGeneration`] (or the Tensorflow/Flax variant), which includes the -language modeling head on top of the decoder. - -- Unsupervised denoising training - -In this setup, spans of the input sequence are masked by so-called sentinel tokens (*a.k.a* unique mask tokens) and -the output sequence is formed as a concatenation of the same sentinel tokens and the *real* masked tokens. Each -sentinel token represents a unique mask token for this sentence and should start with ``, -``, ... up to ``. As a default, 100 sentinel tokens are available in -[`T5Tokenizer`]. - -For instance, the sentence "The cute dog walks in the park" with the masks put on "cute dog" and "the" should be -processed as follows: - -```python ->>> from transformers import T5Tokenizer, T5ForConditionalGeneration - ->>> tokenizer = T5Tokenizer.from_pretrained("t5-small") ->>> model = T5ForConditionalGeneration.from_pretrained("t5-small") - ->>> input_ids = tokenizer("The walks in park", return_tensors="pt").input_ids ->>> labels = tokenizer(" cute dog the ", return_tensors="pt").input_ids - ->>> # the forward function automatically creates the correct decoder_input_ids ->>> loss = model(input_ids=input_ids, labels=labels).loss ->>> loss.item() -3.7837 -``` - -If you're interested in pre-training T5 on a new corpus, check out the [run_t5_mlm_flax.py](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling) script in the Examples -directory. - -- Supervised training - -In this setup, the input sequence and output sequence are a standard sequence-to-sequence input-output mapping. 
-Suppose that we want to fine-tune the model for translation for example, and we have a training example: the input -sequence "The house is wonderful." and output sequence "Das Haus ist wunderbar.", then they should be prepared for -the model as follows: - -```python ->>> from transformers import T5Tokenizer, T5ForConditionalGeneration - ->>> tokenizer = T5Tokenizer.from_pretrained("t5-small") ->>> model = T5ForConditionalGeneration.from_pretrained("t5-small") - ->>> input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids ->>> labels = tokenizer("Das Haus ist wunderbar.", return_tensors="pt").input_ids - ->>> # the forward function automatically creates the correct decoder_input_ids ->>> loss = model(input_ids=input_ids, labels=labels).loss ->>> loss.item() -0.2542 -``` - -As you can see, only 2 inputs are required for the model in order to compute a loss: `input_ids` (which are the -`input_ids` of the encoded input sequence) and `labels` (which are the `input_ids` of the encoded -target sequence). The model will automatically create the `decoder_input_ids` based on the `labels`, by -shifting them one position to the right and prepending the `config.decoder_start_token_id`, which for T5 is -equal to 0 (i.e. the id of the pad token). Also note the task prefix: we prepend the input sequence with 'translate -English to German: ' before encoding it. This will help in improving the performance, as this task prefix was used -during T5's pre-training. - -However, the example above only shows a single training example. In practice, one trains deep learning models in -batches. This entails that we must pad/truncate examples to the same length. For encoder-decoder models, one -typically defines a `max_source_length` and `max_target_length`, which determine the maximum length of the -input and output sequences respectively (otherwise they are truncated). These should be carefully set depending on -the task. - -In addition, we must make sure that padding token id's of the `labels` are not taken into account by the loss -function. In PyTorch and Tensorflow, this can be done by replacing them with -100, which is the `ignore_index` -of the `CrossEntropyLoss`. In Flax, one can use the `decoder_attention_mask` to ignore padded tokens from -the loss (see the [Flax summarization script](https://github.com/huggingface/transformers/tree/main/examples/flax/summarization) for details). We also pass -`attention_mask` as additional input to the model, which makes sure that padding tokens of the inputs are -ignored. The code example below illustrates all of this. - -```python ->>> from transformers import T5Tokenizer, T5ForConditionalGeneration ->>> import torch - ->>> tokenizer = T5Tokenizer.from_pretrained("t5-small") ->>> model = T5ForConditionalGeneration.from_pretrained("t5-small") - ->>> # the following 2 hyperparameters are task-specific ->>> max_source_length = 512 ->>> max_target_length = 128 - ->>> # Suppose we have the following 2 training examples: ->>> input_sequence_1 = "Welcome to NYC" ->>> output_sequence_1 = "Bienvenue à NYC" - ->>> input_sequence_2 = "HuggingFace is a company" ->>> output_sequence_2 = "HuggingFace est une entreprise" - ->>> # encode the inputs ->>> task_prefix = "translate English to French: " ->>> input_sequences = [input_sequence_1, input_sequence_2] - ->>> encoding = tokenizer( -... [task_prefix + sequence for sequence in input_sequences], -... padding="longest", -... max_length=max_source_length, -... truncation=True, -... 
return_tensors="pt", -... ) - ->>> input_ids, attention_mask = encoding.input_ids, encoding.attention_mask - ->>> # encode the targets ->>> target_encoding = tokenizer( -... [output_sequence_1, output_sequence_2], -... padding="longest", -... max_length=max_target_length, -... truncation=True, -... return_tensors="pt", -... ) ->>> labels = target_encoding.input_ids - ->>> # replace padding token id's of the labels by -100 so it's ignored by the loss ->>> labels[labels == tokenizer.pad_token_id] = -100 - ->>> # forward pass ->>> loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss ->>> loss.item() -0.188 -``` - -Additional training tips: - -- T5 models need a slightly higher learning rate than the default one set in the `Trainer` when using the AdamW -optimizer. Typically, 1e-4 and 3e-4 work well for most problems (classification, summarization, translation, question -answering, question generation). Note that T5 was pre-trained using the AdaFactor optimizer. - -According to [this forum post](https://discuss.huggingface.co/t/t5-finetuning-tips/684), task prefixes matter when -(1) doing multi-task training (2) your task is similar or related to one of the supervised tasks used in T5's -pre-training mixture (see Appendix D of the [paper](https://arxiv.org/pdf/1910.10683.pdf) for the task prefixes -used). - -If training on TPU, it is recommended to pad all examples of the dataset to the same length or make use of -*pad_to_multiple_of* to have a small number of predefined bucket sizes to fit all examples in. Dynamically padding -batches to the longest example is not recommended on TPU as it triggers a recompilation for every batch shape that is -encountered during training thus significantly slowing down the training. only padding up to the longest example in a -batch) leads to very slow training on TPU. - - - -## Inference - -At inference time, it is recommended to use [`~generation.GenerationMixin.generate`]. This -method takes care of encoding the input and feeding the encoded hidden states via cross-attention layers to the decoder -and auto-regressively generates the decoder output. Check out [this blog post](https://huggingface.co/blog/how-to-generate) to know all the details about generating text with Transformers. -There's also [this blog post](https://huggingface.co/blog/encoder-decoder#encoder-decoder) which explains how -generation works in general in encoder-decoder models. - -```python ->>> from transformers import T5Tokenizer, T5ForConditionalGeneration - ->>> tokenizer = T5Tokenizer.from_pretrained("t5-small") ->>> model = T5ForConditionalGeneration.from_pretrained("t5-small") - ->>> input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids ->>> outputs = model.generate(input_ids) ->>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) -Das Haus ist wunderbar. -``` - -Note that T5 uses the `pad_token_id` as the `decoder_start_token_id`, so when doing generation without using -[`~generation.GenerationMixin.generate`], make sure you start it with the `pad_token_id`. - -The example above only shows a single example. 
You can also do batched inference, like so: - -```python ->>> from transformers import T5Tokenizer, T5ForConditionalGeneration - ->>> tokenizer = T5Tokenizer.from_pretrained("t5-small") ->>> model = T5ForConditionalGeneration.from_pretrained("t5-small") - ->>> task_prefix = "translate English to German: " ->>> # use different length sentences to test batching ->>> sentences = ["The house is wonderful.", "I like to work in NYC."] - ->>> inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True) - ->>> output_sequences = model.generate( -... input_ids=inputs["input_ids"], -... attention_mask=inputs["attention_mask"], -... do_sample=False, # disable sampling to test if batching affects output -... ) - ->>> print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True)) -['Das Haus ist wunderbar.', 'Ich arbeite gerne in NYC.'] -``` - -Because T5 has been trained with the span-mask denoising objective, -it can be used to predict the sentinel (masked-out) tokens during inference. -The predicted tokens will then be placed between the sentinel tokens. - -```python ->>> from transformers import T5Tokenizer, T5ForConditionalGeneration - ->>> tokenizer = T5Tokenizer.from_pretrained("t5-small") ->>> model = T5ForConditionalGeneration.from_pretrained("t5-small") - ->>> input_ids = tokenizer("The walks in park", return_tensors="pt").input_ids - ->>> sequence_ids = model.generate(input_ids) ->>> sequences = tokenizer.batch_decode(sequence_ids) ->>> sequences -[' park offers the park.'] -``` - - - - -## Performance - -If you'd like a faster training and inference performance, install [apex](https://github.com/NVIDIA/apex#quick-start) and then the model will automatically use `apex.normalization.FusedRMSNorm` instead of `T5LayerNorm`. The former uses an optimized fused kernel which is several times faster than the latter. - - -## Resources - -A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with T5. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. - - - -- A notebook for how to [finetune T5 for classification and multiple choice](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb). -- A notebook for how to [finetune T5 for sentiment span extraction](https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb). 🌎 - - - -- A notebook for how to [finetune T5 for named entity recognition](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing). 🌎 - - - -- A notebook for [Finetuning CodeT5 for generating docstrings from Ruby code](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/T5/Fine_tune_CodeT5_for_generating_docstrings_from_Ruby_code.ipynb). - - - -- A notebook to [Finetune T5-base-dutch to perform Dutch abstractive summarization on a TPU](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/T5/Fine_tuning_Dutch_T5_base_on_CNN_Daily_Mail_for_summarization_(on_TPU_using_HuggingFace_Accelerate).ipynb). 
-- A notebook for how to [finetune T5 for summarization in PyTorch and track experiments with WandB](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb#scrollTo=OKRpFvYhBauC). 🌎 -- A blog post on [Distributed Training: Train BART/T5 for Summarization using 🤗 Transformers and Amazon SageMaker](https://huggingface.co/blog/sagemaker-distributed-training-seq2seq). -- [`T5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) and [noteboook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb). -- [`TFT5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb). -- [`FlaxT5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/summarization). -- [Summarization](https://huggingface.co/course/chapter7/5?fw=pt#summarization) chapter of the 🤗 Hugging Face course. - - - -- [`FlaxT5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#t5-like-span-masked-language-modeling) for training T5 with a span-masked language model objective. The script also shows how to train a T5 tokenizer. [`FlaxT5ForConditionalGeneration`] is also supported by this [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/masked_language_modeling_flax.ipynb). - - - -- [`T5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/translation) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb). -- [`TFT5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/translation) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb). - - - -- A notebook on how to [finetune T5 for question answering with TensorFlow 2](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb). 🌎 -- A notebook on how to [finetune T5 for question answering on a TPU](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil). - -🚀 **Deploy** -- A blog post on how to deploy [T5 11B for inference for less than $500](https://www.philschmid.de/deploy-t5-11b). 
- -## T5Config - -[[autodoc]] T5Config - -## T5Tokenizer - -[[autodoc]] T5Tokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## T5TokenizerFast - -[[autodoc]] T5TokenizerFast - -## T5Model - -[[autodoc]] T5Model - - forward - - parallelize - - deparallelize - -## T5ForConditionalGeneration - -[[autodoc]] T5ForConditionalGeneration - - forward - - parallelize - - deparallelize - -## T5EncoderModel - -[[autodoc]] T5EncoderModel - - forward - - parallelize - - deparallelize - -## TFT5Model - -[[autodoc]] TFT5Model - - call - -## TFT5ForConditionalGeneration - -[[autodoc]] TFT5ForConditionalGeneration - - call - -## TFT5EncoderModel - -[[autodoc]] TFT5EncoderModel - - call - -## FlaxT5Model - -[[autodoc]] FlaxT5Model - - __call__ - - encode - - decode - -## FlaxT5ForConditionalGeneration - -[[autodoc]] FlaxT5ForConditionalGeneration - - __call__ - - encode - - decode - -## FlaxT5EncoderModel - -[[autodoc]] FlaxT5EncoderModel - - __call__ diff --git a/docs/source/en/model_doc/t5v1.1.md b/docs/source/en/model_doc/t5v1.1.md new file mode 100644 index 000000000000..900e26f521dd --- /dev/null +++ b/docs/source/en/model_doc/t5v1.1.md @@ -0,0 +1,65 @@ + + +# T5v1.1 + +## Overview + +T5v1.1 was released in the [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) +repository by Colin Raffel et al. It's an improved version of the original T5 model. + +One can directly plug in the weights of T5v1.1 into a T5 model, like so: + +```python +>>> from transformers import T5ForConditionalGeneration + +>>> model = T5ForConditionalGeneration.from_pretrained("google/t5-v1_1-base") +``` + +T5 Version 1.1 includes the following improvements compared to the original T5 model: + +- GEGLU activation in the feed-forward hidden layer, rather than ReLU. See [this paper](https://arxiv.org/abs/2002.05202). + +- Dropout was turned off in pre-training (quality win). Dropout should be re-enabled during fine-tuning. + +- Pre-trained on C4 only without mixing in the downstream tasks. + +- No parameter sharing between the embedding and classifier layer. + +- "xl" and "xxl" replace "3B" and "11B". The model shapes are a bit different - larger `d_model` and smaller + `num_heads` and `d_ff`. + +Note: T5 Version 1.1 was only pre-trained on [C4](https://huggingface.co/datasets/c4) excluding any supervised +training. Therefore, this model has to be fine-tuned before it is usable on a downstream task, unlike the original T5 +model. Since t5v1.1 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task +fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix. + +Google has released the following variants: + +- [google/t5-v1_1-small](https://huggingface.co/google/t5-v1_1-small) + +- [google/t5-v1_1-base](https://huggingface.co/google/t5-v1_1-base) + +- [google/t5-v1_1-large](https://huggingface.co/google/t5-v1_1-large) + +- [google/t5-v1_1-xl](https://huggingface.co/google/t5-v1_1-xl) + +- [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl). + +One can refer to [T5's documentation page](t5) for all tips, code examples and notebooks. + +This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The original code can be +found [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511). 
diff --git a/docs/source/en/model_doc/t5v1.1.mdx b/docs/source/en/model_doc/t5v1.1.mdx deleted file mode 100644 index a5b64f77dc7c..000000000000 --- a/docs/source/en/model_doc/t5v1.1.mdx +++ /dev/null @@ -1,61 +0,0 @@ - - -# T5v1.1 - -## Overview - -T5v1.1 was released in the [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) -repository by Colin Raffel et al. It's an improved version of the original T5 model. - -One can directly plug in the weights of T5v1.1 into a T5 model, like so: - -```python ->>> from transformers import T5ForConditionalGeneration - ->>> model = T5ForConditionalGeneration.from_pretrained("google/t5-v1_1-base") -``` - -T5 Version 1.1 includes the following improvements compared to the original T5 model: - -- GEGLU activation in the feed-forward hidden layer, rather than ReLU. See [this paper](https://arxiv.org/abs/2002.05202). - -- Dropout was turned off in pre-training (quality win). Dropout should be re-enabled during fine-tuning. - -- Pre-trained on C4 only without mixing in the downstream tasks. - -- No parameter sharing between the embedding and classifier layer. - -- "xl" and "xxl" replace "3B" and "11B". The model shapes are a bit different - larger `d_model` and smaller - `num_heads` and `d_ff`. - -Note: T5 Version 1.1 was only pre-trained on [C4](https://huggingface.co/datasets/c4) excluding any supervised -training. Therefore, this model has to be fine-tuned before it is usable on a downstream task, unlike the original T5 -model. Since t5v1.1 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task -fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix. - -Google has released the following variants: - -- [google/t5-v1_1-small](https://huggingface.co/google/t5-v1_1-small) - -- [google/t5-v1_1-base](https://huggingface.co/google/t5-v1_1-base) - -- [google/t5-v1_1-large](https://huggingface.co/google/t5-v1_1-large) - -- [google/t5-v1_1-xl](https://huggingface.co/google/t5-v1_1-xl) - -- [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl). - -One can refer to [T5's documentation page](t5) for all tips, code examples and notebooks. - -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The original code can be -found [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511). diff --git a/docs/source/en/model_doc/table-transformer.md b/docs/source/en/model_doc/table-transformer.md new file mode 100644 index 000000000000..7ea7ae8cd352 --- /dev/null +++ b/docs/source/en/model_doc/table-transformer.md @@ -0,0 +1,68 @@ + + +# Table Transformer + +## Overview + +The Table Transformer model was proposed in [PubTables-1M: Towards comprehensive table extraction from unstructured documents](https://arxiv.org/abs/2110.00061) by +Brandon Smock, Rohith Pesala, Robin Abraham. The authors introduce a new dataset, PubTables-1M, to benchmark progress in table extraction from unstructured documents, +as well as table structure recognition and functional analysis. The authors train 2 [DETR](detr) models, one for table detection and one for table structure recognition, dubbed Table Transformers. + +The abstract from the paper is the following: + +*Recently, significant progress has been made applying machine learning to the problem of table structure inference and extraction from unstructured documents. 
+However, one of the greatest challenges remains the creation of datasets with complete, unambiguous ground truth at scale. To address this, we develop a new, more +comprehensive dataset for table extraction, called PubTables-1M. PubTables-1M contains nearly one million tables from scientific articles, supports multiple input +modalities, and contains detailed header and location information for table structures, making it useful for a wide variety of modeling approaches. It also addresses a significant +source of ground truth inconsistency observed in prior datasets called oversegmentation, using a novel canonicalization procedure. We demonstrate that these improvements lead to a +significant increase in training performance and a more reliable estimate of model performance at evaluation for table structure recognition. Further, we show that transformer-based +object detection models trained on PubTables-1M produce excellent results for all three tasks of detection, structure recognition, and functional analysis without the need for any +special customization for these tasks.* + +Tips: + +- The authors released 2 models, one for [table detection](https://huggingface.co/microsoft/table-transformer-detection) in documents, one for [table structure recognition](https://huggingface.co/microsoft/table-transformer-structure-recognition) (the task of recognizing the individual rows, columns etc. in a table). +- One can use the [`AutoImageProcessor`] API to prepare images and optional targets for the model. This will load a [`DetrImageProcessor`] behind the scenes. + + + + Table detection and table structure recognition clarified. Taken from the original paper. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be +found [here](https://github.com/microsoft/table-transformer). + +## Resources + + + +- A demo notebook for the Table Transformer can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Table%20Transformer). +- It turns out padding of images is quite important for detection. An interesting Github thread with replies from the authors can be found [here](https://github.com/microsoft/table-transformer/issues/68). + +## TableTransformerConfig + +[[autodoc]] TableTransformerConfig + +## TableTransformerModel + +[[autodoc]] TableTransformerModel + - forward + +## TableTransformerForObjectDetection + +[[autodoc]] TableTransformerForObjectDetection + - forward diff --git a/docs/source/en/model_doc/table-transformer.mdx b/docs/source/en/model_doc/table-transformer.mdx deleted file mode 100644 index 862f4124c25f..000000000000 --- a/docs/source/en/model_doc/table-transformer.mdx +++ /dev/null @@ -1,64 +0,0 @@ - - -# Table Transformer - -## Overview - -The Table Transformer model was proposed in [PubTables-1M: Towards comprehensive table extraction from unstructured documents](https://arxiv.org/abs/2110.00061) by -Brandon Smock, Rohith Pesala, Robin Abraham. The authors introduce a new dataset, PubTables-1M, to benchmark progress in table extraction from unstructured documents, -as well as table structure recognition and functional analysis. The authors train 2 [DETR](detr) models, one for table detection and one for table structure recognition, dubbed Table Transformers. - -The abstract from the paper is the following: - -*Recently, significant progress has been made applying machine learning to the problem of table structure inference and extraction from unstructured documents. 
-However, one of the greatest challenges remains the creation of datasets with complete, unambiguous ground truth at scale. To address this, we develop a new, more -comprehensive dataset for table extraction, called PubTables-1M. PubTables-1M contains nearly one million tables from scientific articles, supports multiple input -modalities, and contains detailed header and location information for table structures, making it useful for a wide variety of modeling approaches. It also addresses a significant -source of ground truth inconsistency observed in prior datasets called oversegmentation, using a novel canonicalization procedure. We demonstrate that these improvements lead to a -significant increase in training performance and a more reliable estimate of model performance at evaluation for table structure recognition. Further, we show that transformer-based -object detection models trained on PubTables-1M produce excellent results for all three tasks of detection, structure recognition, and functional analysis without the need for any -special customization for these tasks.* - -Tips: - -- The authors released 2 models, one for [table detection](https://huggingface.co/microsoft/table-transformer-detection) in documents, one for [table structure recognition](https://huggingface.co/microsoft/table-transformer-structure-recognition) (the task of recognizing the individual rows, columns etc. in a table). -- One can use the [`AutoImageProcessor`] API to prepare images and optional targets for the model. This will load a [`DetrImageProcessor`] behind the scenes. - - - - Table detection and table structure recognition clarified. Taken from the original paper. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be -found [here](https://github.com/microsoft/table-transformer). - -## Resources - - - -- A demo notebook for the Table Transformer can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Table%20Transformer). -- It turns out padding of images is quite important for detection. An interesting Github thread with replies from the authors can be found [here](https://github.com/microsoft/table-transformer/issues/68). - -## TableTransformerConfig - -[[autodoc]] TableTransformerConfig - -## TableTransformerModel - -[[autodoc]] TableTransformerModel - - forward - -## TableTransformerForObjectDetection - -[[autodoc]] TableTransformerForObjectDetection - - forward diff --git a/docs/source/en/model_doc/tapas.md b/docs/source/en/model_doc/tapas.md new file mode 100644 index 000000000000..1c76015f2857 --- /dev/null +++ b/docs/source/en/model_doc/tapas.md @@ -0,0 +1,623 @@ + + +# TAPAS + +## Overview + +The TAPAS model was proposed in [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://www.aclweb.org/anthology/2020.acl-main.398) +by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. It's a BERT-based model specifically +designed (and pre-trained) for answering questions about tabular data. Compared to BERT, TAPAS uses relative position embeddings and has 7 +token types that encode tabular structure. TAPAS is pre-trained on the masked language modeling (MLM) objective on a large dataset comprising +millions of tables from English Wikipedia and corresponding texts. + +For question answering, TAPAS has 2 heads on top: a cell selection head and an aggregation head, for (optionally) performing aggregations (such as counting or summing) among selected cells. 
TAPAS has been fine-tuned on several datasets: +- [SQA](https://www.microsoft.com/en-us/download/details.aspx?id=54253) (Sequential Question Answering by Microsoft) +- [WTQ](https://github.com/ppasupat/WikiTableQuestions) (Wiki Table Questions by Stanford University) +- [WikiSQL](https://github.com/salesforce/WikiSQL) (by Salesforce). + +It achieves state-of-the-art on both SQA and WTQ, while having comparable performance to SOTA on WikiSQL, with a much simpler architecture. + +The abstract from the paper is the following: + +*Answering natural language questions over tables is usually seen as a semantic parsing task. To alleviate the collection cost of full logical forms, one popular approach focuses on weak supervision consisting of denotations instead of logical forms. However, training semantic parsers from weak supervision poses difficulties, and in addition, the generated logical forms are only used as an intermediate step prior to retrieving the denotation. In this paper, we present TAPAS, an approach to question answering over tables without generating logical forms. TAPAS trains from weak supervision, and predicts the denotation by selecting table cells and optionally applying a corresponding aggregation operator to such selection. TAPAS extends BERT's architecture to encode tables as input, initializes from an effective joint pre-training of text segments and tables crawled from Wikipedia, and is trained end-to-end. We experiment with three different semantic parsing datasets, and find that TAPAS outperforms or rivals semantic parsing models by improving state-of-the-art accuracy on SQA from 55.1 to 67.2 and performing on par with the state-of-the-art on WIKISQL and WIKITQ, but with a simpler model architecture. We additionally find that transfer learning, which is trivial in our setting, from WIKISQL to WIKITQ, yields 48.7 accuracy, 4.2 points above the state-of-the-art.* + +In addition, the authors have further pre-trained TAPAS to recognize **table entailment**, by creating a balanced dataset of millions of automatically created training examples which are learned in an intermediate step prior to fine-tuning. The authors of TAPAS call this further pre-training intermediate pre-training (since TAPAS is first pre-trained on MLM, and then on another dataset). They found that intermediate pre-training further improves performance on SQA, achieving a new state-of-the-art as well as state-of-the-art on [TabFact](https://github.com/wenhuchen/Table-Fact-Checking), a large-scale dataset with 16k Wikipedia tables for table entailment (a binary classification task). For more details, see their follow-up paper: [Understanding tables with intermediate pre-training](https://www.aclweb.org/anthology/2020.findings-emnlp.27/) by Julian Martin Eisenschlos, Syrine Krichene and Thomas Müller. + + + + TAPAS architecture. Taken from the original blog post. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). The Tensorflow version of this model was contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code can be found [here](https://github.com/google-research/tapas). + +Tips: + +- TAPAS is a model that uses relative position embeddings by default (restarting the position embeddings at every cell of the table). Note that this is something that was added after the publication of the original TAPAS paper. 
According to the authors, this usually results in slightly better performance, and allows you to encode longer sequences without running out of embeddings. This is reflected in the `reset_position_index_per_cell` parameter of [`TapasConfig`], which is set to `True` by default. The default versions of the models available on the [hub](https://huggingface.co/models?search=tapas) all use relative position embeddings. You can still use the ones with absolute position embeddings by passing in an additional argument `revision="no_reset"` when calling the `from_pretrained()` method. Note that it's usually advised to pad the inputs on the right rather than the left.
+- TAPAS is based on BERT, so `TAPAS-base` for example corresponds to a `BERT-base` architecture. Of course, `TAPAS-large` will result in the best performance (the results reported in the paper are from `TAPAS-large`). Results of the various sized models are shown on the [original GitHub repository](https://github.com/google-research/tapas).
+- TAPAS has checkpoints fine-tuned on SQA, which are capable of answering questions related to a table in a conversational set-up. This means that you can ask follow-up questions such as "what is his age?" related to the previous question. Note that the forward pass of TAPAS is a bit different in case of a conversational set-up: in that case, you have to feed every table-question pair one by one to the model, such that the `prev_labels` token type ids can be overwritten by the predicted `labels` of the model for the previous question. See the "Usage" section for more info.
+- TAPAS is similar to BERT and therefore relies on the masked language modeling (MLM) objective. It is therefore efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation. Models trained with a causal language modeling (CLM) objective are better in that regard. Note that TAPAS can be used as an encoder in the EncoderDecoderModel framework, to combine it with an autoregressive text decoder such as GPT-2.
+
+## Usage: fine-tuning
+
+Here we explain how you can fine-tune [`TapasForQuestionAnswering`] on your own dataset.
+
+**STEP 1: Choose one of the 3 ways in which you can use TAPAS - or experiment**
+
+Basically, there are 3 different ways in which one can fine-tune [`TapasForQuestionAnswering`], corresponding to the different datasets on which TAPAS was fine-tuned:
+
+1. SQA: if you're interested in asking follow-up questions related to a table, in a conversational set-up. For example, if you first ask "what's the name of the first actor?" then you can ask a follow-up question such as "how old is he?". Here, questions do not involve any aggregation (all questions are cell selection questions).
+2. WTQ: if you're not interested in asking questions in a conversational set-up, but rather just asking questions related to a table, which might involve aggregation, such as counting the number of rows, summing up cell values or averaging cell values. You can then for example ask "what's the total number of goals Cristiano Ronaldo made in his career?". This case is also called **weak supervision**, since the model itself must learn the appropriate aggregation operator (SUM/COUNT/AVERAGE/NONE) given only the answer to the question as supervision.
+3. WikiSQL-supervised: this dataset is based on WikiSQL with the model being given the ground truth aggregation operator during training. This is also called **strong supervision**. Here, learning the appropriate aggregation operator is much easier.
+
+To summarize:
+
+| **Task**                            | **Example dataset** | **Description**                                                                                           |
+|-------------------------------------|---------------------|-----------------------------------------------------------------------------------------------------------|
+| Conversational                      | SQA                 | Conversational, only cell selection questions                                                             |
+| Weak supervision for aggregation    | WTQ                 | Questions might involve aggregation, and the model must learn this given only the answer as supervision   |
+| Strong supervision for aggregation  | WikiSQL-supervised  | Questions might involve aggregation, and the model must learn this given the gold aggregation operator    |
+
+
+
+
+Initializing a model with a pre-trained base and randomly initialized classification heads from the hub can be done as shown below.
+
+```py
+>>> from transformers import TapasConfig, TapasForQuestionAnswering
+
+>>> # for example, the base sized model with default SQA configuration
+>>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base")
+
+>>> # or, the base sized model with WTQ configuration
+>>> config = TapasConfig.from_pretrained("google/tapas-base-finetuned-wtq")
+>>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
+
+>>> # or, the base sized model with WikiSQL configuration
+>>> config = TapasConfig.from_pretrained("google/tapas-base-finetuned-wikisql-supervised")
+>>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
+```
+
+Of course, you don't necessarily have to follow one of these three ways in which TAPAS was fine-tuned. You can also experiment by defining any hyperparameters you want when initializing [`TapasConfig`], and then create a [`TapasForQuestionAnswering`] based on that configuration. For example, if you have a dataset that has both conversational questions and questions that might involve aggregation, then you can do it this way. Here's an example:
+
+```py
+>>> from transformers import TapasConfig, TapasForQuestionAnswering
+
+>>> # you can initialize the classification heads any way you want (see docs of TapasConfig)
+>>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True)
+>>> # initializing the pre-trained base sized model with our custom classification heads
+>>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
+```
+
+
+
+Initializing a model with a pre-trained base and randomly initialized classification heads from the hub can be done as shown below. Be sure to have installed the [tensorflow_probability](https://github.com/tensorflow/probability) dependency:
+
+```py
+>>> from transformers import TapasConfig, TFTapasForQuestionAnswering
+
+>>> # for example, the base sized model with default SQA configuration
+>>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base")
+
+>>> # or, the base sized model with WTQ configuration
+>>> config = TapasConfig.from_pretrained("google/tapas-base-finetuned-wtq")
+>>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
+
+>>> # or, the base sized model with WikiSQL configuration
+>>> config = TapasConfig.from_pretrained("google/tapas-base-finetuned-wikisql-supervised")
+>>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
+```
+
+Of course, you don't necessarily have to follow one of these three ways in which TAPAS was fine-tuned.
You can also experiment by defining any hyperparameters you want when initializing [`TapasConfig`], and then create a [`TFTapasForQuestionAnswering`] based on that configuration. For example, if you have a dataset that has both conversational questions and questions that might involve aggregation, then you can do it this way. Here's an example: + +```py +>>> from transformers import TapasConfig, TFTapasForQuestionAnswering + +>>> # you can initialize the classification heads any way you want (see docs of TapasConfig) +>>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True) +>>> # initializing the pre-trained base sized model with our custom classification heads +>>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) +``` + + + +What you can also do is start from an already fine-tuned checkpoint. A note here is that the already fine-tuned checkpoint on WTQ has some issues due to the L2-loss which is somewhat brittle. See [here](https://github.com/google-research/tapas/issues/91#issuecomment-735719340) for more info. + +For a list of all pre-trained and fine-tuned TAPAS checkpoints available on HuggingFace's hub, see [here](https://huggingface.co/models?search=tapas). + +**STEP 2: Prepare your data in the SQA format** + +Second, no matter what you picked above, you should prepare your dataset in the [SQA](https://www.microsoft.com/en-us/download/details.aspx?id=54253) format. This format is a TSV/CSV file with the following columns: + +- `id`: optional, id of the table-question pair, for bookkeeping purposes. +- `annotator`: optional, id of the person who annotated the table-question pair, for bookkeeping purposes. +- `position`: integer indicating if the question is the first, second, third,... related to the table. Only required in case of conversational setup (SQA). You don't need this column in case you're going for WTQ/WikiSQL-supervised. +- `question`: string +- `table_file`: string, name of a csv file containing the tabular data +- `answer_coordinates`: list of one or more tuples (each tuple being a cell coordinate, i.e. row, column pair that is part of the answer) +- `answer_text`: list of one or more strings (each string being a cell value that is part of the answer) +- `aggregation_label`: index of the aggregation operator. Only required in case of strong supervision for aggregation (the WikiSQL-supervised case) +- `float_answer`: the float answer to the question, if there is one (np.nan if there isn't). Only required in case of weak supervision for aggregation (such as WTQ and WikiSQL) + +The tables themselves should be present in a folder, each table being a separate csv file. Note that the authors of the TAPAS algorithm used conversion scripts with some automated logic to convert the other datasets (WTQ, WikiSQL) into the SQA format. The author explains this [here](https://github.com/google-research/tapas/issues/50#issuecomment-705465960). A conversion of this script that works with HuggingFace's implementation can be found [here](https://github.com/NielsRogge/tapas_utils). Interestingly, these conversion scripts are not perfect (the `answer_coordinates` and `float_answer` fields are populated based on the `answer_text`), meaning that WTQ and WikiSQL results could actually be improved. 
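+
+As a rough illustration of this format, the snippet below writes a single weak-supervision (WTQ-style) training example to a TSV file. The file names (`train.tsv`, `table_csv/001.csv`) and values are made up for the example. Note that list-valued columns such as `answer_coordinates` and `answer_text` end up stored as string representations in the TSV, so you may need to parse them back (e.g. with `ast.literal_eval`) when reading the file.
+
+```py
+>>> import pandas as pd
+
+>>> # one table-question pair in the SQA format (weak supervision for aggregation)
+>>> rows = [
+...     {
+...         "id": "example-1",  # hypothetical id, for bookkeeping
+...         "annotator": 0,
+...         "position": 0,
+...         "question": "What is the total number of movies?",
+...         "table_file": "table_csv/001.csv",  # csv file containing the actual table
+...         "answer_coordinates": [(0, 1), (1, 1), (2, 1)],  # cells that make up the answer
+...         "answer_text": ["209"],
+...         "float_answer": 209.0,  # the question involves an aggregation (a sum)
+...     }
+... ]
+>>> pd.DataFrame(rows).to_csv("train.tsv", sep="\t", index=False)
+```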
+ +**STEP 3: Convert your data into tensors using TapasTokenizer** + + + +Third, given that you've prepared your data in this TSV/CSV format (and corresponding CSV files containing the tabular data), you can then use [`TapasTokenizer`] to convert table-question pairs into `input_ids`, `attention_mask`, `token_type_ids` and so on. Again, based on which of the three cases you picked above, [`TapasForQuestionAnswering`] requires different +inputs to be fine-tuned: + +| **Task** | **Required inputs** | +|------------------------------------|---------------------------------------------------------------------------------------------------------------------| +| Conversational | `input_ids`, `attention_mask`, `token_type_ids`, `labels` | +| Weak supervision for aggregation | `input_ids`, `attention_mask`, `token_type_ids`, `labels`, `numeric_values`, `numeric_values_scale`, `float_answer` | +| Strong supervision for aggregation | `input ids`, `attention mask`, `token type ids`, `labels`, `aggregation_labels` | + +[`TapasTokenizer`] creates the `labels`, `numeric_values` and `numeric_values_scale` based on the `answer_coordinates` and `answer_text` columns of the TSV file. The `float_answer` and `aggregation_labels` are already in the TSV file of step 2. Here's an example: + +```py +>>> from transformers import TapasTokenizer +>>> import pandas as pd + +>>> model_name = "google/tapas-base" +>>> tokenizer = TapasTokenizer.from_pretrained(model_name) + +>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} +>>> queries = [ +... "What is the name of the first actor?", +... "How many movies has George Clooney played in?", +... "What is the total number of movies?", +... ] +>>> answer_coordinates = [[(0, 0)], [(2, 1)], [(0, 1), (1, 1), (2, 1)]] +>>> answer_text = [["Brad Pitt"], ["69"], ["209"]] +>>> table = pd.DataFrame.from_dict(data) +>>> inputs = tokenizer( +... table=table, +... queries=queries, +... answer_coordinates=answer_coordinates, +... answer_text=answer_text, +... padding="max_length", +... return_tensors="pt", +... ) +>>> inputs +{'input_ids': tensor([[ ... ]]), 'attention_mask': tensor([[...]]), 'token_type_ids': tensor([[[...]]]), +'numeric_values': tensor([[ ... ]]), 'numeric_values_scale: tensor([[ ... ]]), labels: tensor([[ ... ]])} +``` + +Note that [`TapasTokenizer`] expects the data of the table to be **text-only**. You can use `.astype(str)` on a dataframe to turn it into text-only data. +Of course, this only shows how to encode a single training example. It is advised to create a dataloader to iterate over batches: + +```py +>>> import torch +>>> import pandas as pd + +>>> tsv_path = "your_path_to_the_tsv_file" +>>> table_csv_path = "your_path_to_a_directory_containing_all_csv_files" + + +>>> class TableDataset(torch.utils.data.Dataset): +... def __init__(self, data, tokenizer): +... self.data = data +... self.tokenizer = tokenizer + +... def __getitem__(self, idx): +... item = data.iloc[idx] +... table = pd.read_csv(table_csv_path + item.table_file).astype( +... str +... ) # be sure to make your table data text only +... encoding = self.tokenizer( +... table=table, +... queries=item.question, +... answer_coordinates=item.answer_coordinates, +... answer_text=item.answer_text, +... truncation=True, +... padding="max_length", +... return_tensors="pt", +... ) +... # remove the batch dimension which the tokenizer adds by default +... encoding = {key: val.squeeze(0) for key, val in encoding.items()} +... 
+...         # add the float_answer which is also required (weak supervision for aggregation case)
+...         encoding["float_answer"] = torch.tensor(item.float_answer)
+...         return encoding
+
+...     def __len__(self):
+...         return len(self.data)
+
+
+>>> data = pd.read_csv(tsv_path, sep="\t")
+>>> train_dataset = TableDataset(data, tokenizer)
+>>> train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32)
+```
+
+
+Third, given that you've prepared your data in this TSV/CSV format (and corresponding CSV files containing the tabular data), you can then use [`TapasTokenizer`] to convert table-question pairs into `input_ids`, `attention_mask`, `token_type_ids` and so on. Again, based on which of the three cases you picked above, [`TFTapasForQuestionAnswering`] requires different
+inputs to be fine-tuned:
+
+| **Task**                           | **Required inputs**                                                                                                   |
+|------------------------------------|-----------------------------------------------------------------------------------------------------------------------|
+| Conversational                     | `input_ids`, `attention_mask`, `token_type_ids`, `labels`                                                             |
+| Weak supervision for aggregation   | `input_ids`, `attention_mask`, `token_type_ids`, `labels`, `numeric_values`, `numeric_values_scale`, `float_answer`  |
+| Strong supervision for aggregation | `input_ids`, `attention_mask`, `token_type_ids`, `labels`, `aggregation_labels`                                      |
+
+[`TapasTokenizer`] creates the `labels`, `numeric_values` and `numeric_values_scale` based on the `answer_coordinates` and `answer_text` columns of the TSV file. The `float_answer` and `aggregation_labels` are already in the TSV file of step 2. Here's an example:
+
+```py
+>>> from transformers import TapasTokenizer
+>>> import pandas as pd
+
+>>> model_name = "google/tapas-base"
+>>> tokenizer = TapasTokenizer.from_pretrained(model_name)
+
+>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
+>>> queries = [
+...     "What is the name of the first actor?",
+...     "How many movies has George Clooney played in?",
+...     "What is the total number of movies?",
+... ]
+>>> answer_coordinates = [[(0, 0)], [(2, 1)], [(0, 1), (1, 1), (2, 1)]]
+>>> answer_text = [["Brad Pitt"], ["69"], ["209"]]
+>>> table = pd.DataFrame.from_dict(data)
+>>> inputs = tokenizer(
+...     table=table,
+...     queries=queries,
+...     answer_coordinates=answer_coordinates,
+...     answer_text=answer_text,
+...     padding="max_length",
+...     return_tensors="tf",
+... )
+>>> inputs
+{'input_ids': <tf.Tensor ... >, 'attention_mask': <tf.Tensor ... >, 'token_type_ids': <tf.Tensor ... >,
+'numeric_values': <tf.Tensor ... >, 'numeric_values_scale': <tf.Tensor ... >, 'labels': <tf.Tensor ... >}
+```
+
+Note that [`TapasTokenizer`] expects the data of the table to be **text-only**. You can use `.astype(str)` on a dataframe to turn it into text-only data.
+Of course, this only shows how to encode a single training example. It is advised to create a dataloader to iterate over batches:
+
+```py
+>>> import tensorflow as tf
+>>> import pandas as pd
+
+>>> tsv_path = "your_path_to_the_tsv_file"
+>>> table_csv_path = "your_path_to_a_directory_containing_all_csv_files"
+
+
+>>> class TableDataset:
+...     def __init__(self, data, tokenizer):
+...         self.data = data
+...         self.tokenizer = tokenizer
+
+...     def __iter__(self):
+...         for idx in range(self.__len__()):
+...             item = self.data.iloc[idx]
+...             table = pd.read_csv(table_csv_path + item.table_file).astype(
+...                 str
+...             )  # be sure to make your table data text only
+...             encoding = self.tokenizer(
+...                 table=table,
+...                 queries=item.question,
+...                 answer_coordinates=item.answer_coordinates,
+...                 answer_text=item.answer_text,
+...                 truncation=True,
+...                 padding="max_length",
+...                 return_tensors="tf",
+...             )
+...             # remove the batch dimension which the tokenizer adds by default
+...             encoding = {key: tf.squeeze(val, 0) for key, val in encoding.items()}
+...             # add the float_answer which is also required (weak supervision for aggregation case)
+...             encoding["float_answer"] = tf.convert_to_tensor(item.float_answer, dtype=tf.float32)
+...             yield encoding["input_ids"], encoding["attention_mask"], encoding["numeric_values"], encoding[
+...                 "numeric_values_scale"
+...             ], encoding["token_type_ids"], encoding["labels"], encoding["float_answer"]
+
+...     def __len__(self):
+...         return len(self.data)
+
+
+>>> data = pd.read_csv(tsv_path, sep="\t")
+>>> train_dataset = TableDataset(data, tokenizer)
+>>> output_signature = (
+...     tf.TensorSpec(shape=(512,), dtype=tf.int32),
+...     tf.TensorSpec(shape=(512,), dtype=tf.int32),
+...     tf.TensorSpec(shape=(512,), dtype=tf.float32),
+...     tf.TensorSpec(shape=(512,), dtype=tf.float32),
+...     tf.TensorSpec(shape=(512, 7), dtype=tf.int32),
+...     tf.TensorSpec(shape=(512,), dtype=tf.int32),
+...     tf.TensorSpec(shape=(512,), dtype=tf.float32),
+... )
+>>> train_dataloader = tf.data.Dataset.from_generator(train_dataset, output_signature=output_signature).batch(32)
+```
+
+
+
+Note that here, we encode each table-question pair independently. This is fine as long as your dataset is **not conversational**. In case your dataset involves conversational questions (such as in SQA), then you should first group together the `queries`, `answer_coordinates` and `answer_text` per table (in the order of their `position`
+index) and batch encode each table with its questions. This will make sure that the `prev_labels` token types (see docs of [`TapasTokenizer`]) are set correctly. See [this notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) for more info on the PyTorch model, and [this notebook](https://github.com/kamalkraj/Tapas-Tutorial/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) for more info on the TensorFlow model.
+
+**STEP 4: Train (fine-tune) the model**
+
+
+
+You can then fine-tune [`TapasForQuestionAnswering`] as follows (shown here for the weak supervision for aggregation case):
+
+```py
+>>> from transformers import TapasConfig, TapasForQuestionAnswering, AdamW
+
+>>> # this is the default WTQ configuration
+>>> config = TapasConfig(
+...     num_aggregation_labels=4,
+...     use_answer_as_supervision=True,
+...     answer_loss_cutoff=0.664694,
+...     cell_selection_preference=0.207951,
+...     huber_loss_delta=0.121194,
+...     init_cell_selection_weights_to_zero=True,
+...     select_one_column=True,
+...     allow_empty_column_selection=False,
+...     temperature=0.0352513,
+... )
+>>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
+
+>>> optimizer = AdamW(model.parameters(), lr=5e-5)
+
+>>> model.train()
+>>> for epoch in range(2):  # loop over the dataset multiple times
+...     for batch in train_dataloader:
+...         # get the inputs
+...         input_ids = batch["input_ids"]
+...         attention_mask = batch["attention_mask"]
+...         token_type_ids = batch["token_type_ids"]
+...         labels = batch["labels"]
+...         numeric_values = batch["numeric_values"]
+...         numeric_values_scale = batch["numeric_values_scale"]
+...         float_answer = batch["float_answer"]
+
+...         # zero the parameter gradients
+...         optimizer.zero_grad()
+
+...         # forward + backward + optimize
+...         outputs = model(
+...             input_ids=input_ids,
+...             attention_mask=attention_mask,
+...             token_type_ids=token_type_ids,
+...             labels=labels,
+...             numeric_values=numeric_values,
+...             numeric_values_scale=numeric_values_scale,
+...             float_answer=float_answer,
+...         )
+...         loss = outputs.loss
+...         loss.backward()
+...         optimizer.step()
+```
+
+
+You can then fine-tune [`TFTapasForQuestionAnswering`] as follows (shown here for the weak supervision for aggregation case):
+
+```py
+>>> import tensorflow as tf
+>>> from transformers import TapasConfig, TFTapasForQuestionAnswering
+
+>>> # this is the default WTQ configuration
+>>> config = TapasConfig(
+...     num_aggregation_labels=4,
+...     use_answer_as_supervision=True,
+...     answer_loss_cutoff=0.664694,
+...     cell_selection_preference=0.207951,
+...     huber_loss_delta=0.121194,
+...     init_cell_selection_weights_to_zero=True,
+...     select_one_column=True,
+...     allow_empty_column_selection=False,
+...     temperature=0.0352513,
+... )
+>>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
+
+>>> optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
+
+>>> for epoch in range(2):  # loop over the dataset multiple times
+...     for batch in train_dataloader:
+...         # get the inputs (in the order they are yielded by the dataset above)
+...         input_ids = batch[0]
+...         attention_mask = batch[1]
+...         numeric_values = batch[2]
+...         numeric_values_scale = batch[3]
+...         token_type_ids = batch[4]
+...         labels = batch[5]
+...         float_answer = batch[6]
+
+...         # forward + backward + optimize
+...         with tf.GradientTape() as tape:
+...             outputs = model(
+...                 input_ids=input_ids,
+...                 attention_mask=attention_mask,
+...                 token_type_ids=token_type_ids,
+...                 labels=labels,
+...                 numeric_values=numeric_values,
+...                 numeric_values_scale=numeric_values_scale,
+...                 float_answer=float_answer,
+...             )
+...         grads = tape.gradient(outputs.loss, model.trainable_weights)
+...         optimizer.apply_gradients(zip(grads, model.trainable_weights))
+```
+
+
+
+## Usage: inference
+
+
+
+Here we explain how you can use [`TapasForQuestionAnswering`] or [`TFTapasForQuestionAnswering`] for inference (i.e. making predictions on new data). For inference, only `input_ids`, `attention_mask` and `token_type_ids` (which you can obtain using [`TapasTokenizer`]) have to be provided to the model to obtain the logits. Next, you can use the handy [`~models.tapas.tokenization_tapas.convert_logits_to_predictions`] method to convert these into predicted coordinates and optional aggregation indices.
+
+However, note that inference is **different** depending on whether or not the setup is conversational. In a non-conversational set-up, inference can be done in parallel on all table-question pairs of a batch. Here's an example of that:
+
+```py
+>>> from transformers import TapasTokenizer, TapasForQuestionAnswering
+>>> import pandas as pd
+
+>>> model_name = "google/tapas-base-finetuned-wtq"
+>>> model = TapasForQuestionAnswering.from_pretrained(model_name)
+>>> tokenizer = TapasTokenizer.from_pretrained(model_name)
+
+>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
+>>> queries = [
+...     "What is the name of the first actor?",
+...     "How many movies has George Clooney played in?",
+...     "What is the total number of movies?",
+...
] +>>> table = pd.DataFrame.from_dict(data) +>>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="pt") +>>> outputs = model(**inputs) +>>> predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions( +... inputs, outputs.logits.detach(), outputs.logits_aggregation.detach() +... ) + +>>> # let's print out the results: +>>> id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"} +>>> aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices] + +>>> answers = [] +>>> for coordinates in predicted_answer_coordinates: +... if len(coordinates) == 1: +... # only a single cell: +... answers.append(table.iat[coordinates[0]]) +... else: +... # multiple cells +... cell_values = [] +... for coordinate in coordinates: +... cell_values.append(table.iat[coordinate]) +... answers.append(", ".join(cell_values)) + +>>> display(table) +>>> print("") +>>> for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string): +... print(query) +... if predicted_agg == "NONE": +... print("Predicted answer: " + answer) +... else: +... print("Predicted answer: " + predicted_agg + " > " + answer) +What is the name of the first actor? +Predicted answer: Brad Pitt +How many movies has George Clooney played in? +Predicted answer: COUNT > 69 +What is the total number of movies? +Predicted answer: SUM > 87, 53, 69 +``` + + +Here we explain how you can use [`TFTapasForQuestionAnswering`] for inference (i.e. making predictions on new data). For inference, only `input_ids`, `attention_mask` and `token_type_ids` (which you can obtain using [`TapasTokenizer`]) have to be provided to the model to obtain the logits. Next, you can use the handy [`~models.tapas.tokenization_tapas.convert_logits_to_predictions`] method to convert these into predicted coordinates and optional aggregation indices. + +However, note that inference is **different** depending on whether or not the setup is conversational. In a non-conversational set-up, inference can be done in parallel on all table-question pairs of a batch. Here's an example of that: + +```py +>>> from transformers import TapasTokenizer, TFTapasForQuestionAnswering +>>> import pandas as pd + +>>> model_name = "google/tapas-base-finetuned-wtq" +>>> model = TFTapasForQuestionAnswering.from_pretrained(model_name) +>>> tokenizer = TapasTokenizer.from_pretrained(model_name) + +>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} +>>> queries = [ +... "What is the name of the first actor?", +... "How many movies has George Clooney played in?", +... "What is the total number of movies?", +... ] +>>> table = pd.DataFrame.from_dict(data) +>>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="tf") +>>> outputs = model(**inputs) +>>> predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions( +... inputs, outputs.logits, outputs.logits_aggregation +... ) + +>>> # let's print out the results: +>>> id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"} +>>> aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices] + +>>> answers = [] +>>> for coordinates in predicted_answer_coordinates: +... if len(coordinates) == 1: +... # only a single cell: +... answers.append(table.iat[coordinates[0]]) +... else: +... # multiple cells +... cell_values = [] +... 
for coordinate in coordinates: +... cell_values.append(table.iat[coordinate]) +... answers.append(", ".join(cell_values)) + +>>> display(table) +>>> print("") +>>> for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string): +... print(query) +... if predicted_agg == "NONE": +... print("Predicted answer: " + answer) +... else: +... print("Predicted answer: " + predicted_agg + " > " + answer) +What is the name of the first actor? +Predicted answer: Brad Pitt +How many movies has George Clooney played in? +Predicted answer: COUNT > 69 +What is the total number of movies? +Predicted answer: SUM > 87, 53, 69 +``` + + + +In case of a conversational set-up, then each table-question pair must be provided **sequentially** to the model, such that the `prev_labels` token types can be overwritten by the predicted `labels` of the previous table-question pair. Again, more info can be found in [this notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) (for PyTorch) and [this notebook](https://github.com/kamalkraj/Tapas-Tutorial/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) (for TensorFlow). + +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Masked language modeling task guide](../tasks/masked_language_modeling) + +## TAPAS specific outputs +[[autodoc]] models.tapas.modeling_tapas.TableQuestionAnsweringOutput + +## TapasConfig +[[autodoc]] TapasConfig + +## TapasTokenizer +[[autodoc]] TapasTokenizer + - __call__ + - convert_logits_to_predictions + - save_vocabulary + +## TapasModel +[[autodoc]] TapasModel + - forward + +## TapasForMaskedLM +[[autodoc]] TapasForMaskedLM + - forward + +## TapasForSequenceClassification +[[autodoc]] TapasForSequenceClassification + - forward + +## TapasForQuestionAnswering +[[autodoc]] TapasForQuestionAnswering + - forward + +## TFTapasModel +[[autodoc]] TFTapasModel + - call + +## TFTapasForMaskedLM +[[autodoc]] TFTapasForMaskedLM + - call + +## TFTapasForSequenceClassification +[[autodoc]] TFTapasForSequenceClassification + - call + +## TFTapasForQuestionAnswering +[[autodoc]] TFTapasForQuestionAnswering + - call \ No newline at end of file diff --git a/docs/source/en/model_doc/tapas.mdx b/docs/source/en/model_doc/tapas.mdx deleted file mode 100644 index 5a2b54e8c32c..000000000000 --- a/docs/source/en/model_doc/tapas.mdx +++ /dev/null @@ -1,614 +0,0 @@ - - -# TAPAS - -## Overview - -The TAPAS model was proposed in [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://www.aclweb.org/anthology/2020.acl-main.398) -by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. It's a BERT-based model specifically -designed (and pre-trained) for answering questions about tabular data. Compared to BERT, TAPAS uses relative position embeddings and has 7 -token types that encode tabular structure. TAPAS is pre-trained on the masked language modeling (MLM) objective on a large dataset comprising -millions of tables from English Wikipedia and corresponding texts. - -For question answering, TAPAS has 2 heads on top: a cell selection head and an aggregation head, for (optionally) performing aggregations (such as counting or summing) among selected cells. 
TAPAS has been fine-tuned on several datasets: -- [SQA](https://www.microsoft.com/en-us/download/details.aspx?id=54253) (Sequential Question Answering by Microsoft) -- [WTQ](https://github.com/ppasupat/WikiTableQuestions) (Wiki Table Questions by Stanford University) -- [WikiSQL](https://github.com/salesforce/WikiSQL) (by Salesforce). - -It achieves state-of-the-art on both SQA and WTQ, while having comparable performance to SOTA on WikiSQL, with a much simpler architecture. - -The abstract from the paper is the following: - -*Answering natural language questions over tables is usually seen as a semantic parsing task. To alleviate the collection cost of full logical forms, one popular approach focuses on weak supervision consisting of denotations instead of logical forms. However, training semantic parsers from weak supervision poses difficulties, and in addition, the generated logical forms are only used as an intermediate step prior to retrieving the denotation. In this paper, we present TAPAS, an approach to question answering over tables without generating logical forms. TAPAS trains from weak supervision, and predicts the denotation by selecting table cells and optionally applying a corresponding aggregation operator to such selection. TAPAS extends BERT's architecture to encode tables as input, initializes from an effective joint pre-training of text segments and tables crawled from Wikipedia, and is trained end-to-end. We experiment with three different semantic parsing datasets, and find that TAPAS outperforms or rivals semantic parsing models by improving state-of-the-art accuracy on SQA from 55.1 to 67.2 and performing on par with the state-of-the-art on WIKISQL and WIKITQ, but with a simpler model architecture. We additionally find that transfer learning, which is trivial in our setting, from WIKISQL to WIKITQ, yields 48.7 accuracy, 4.2 points above the state-of-the-art.* - -In addition, the authors have further pre-trained TAPAS to recognize **table entailment**, by creating a balanced dataset of millions of automatically created training examples which are learned in an intermediate step prior to fine-tuning. The authors of TAPAS call this further pre-training intermediate pre-training (since TAPAS is first pre-trained on MLM, and then on another dataset). They found that intermediate pre-training further improves performance on SQA, achieving a new state-of-the-art as well as state-of-the-art on [TabFact](https://github.com/wenhuchen/Table-Fact-Checking), a large-scale dataset with 16k Wikipedia tables for table entailment (a binary classification task). For more details, see their follow-up paper: [Understanding tables with intermediate pre-training](https://www.aclweb.org/anthology/2020.findings-emnlp.27/) by Julian Martin Eisenschlos, Syrine Krichene and Thomas Müller. - - - - TAPAS architecture. Taken from the original blog post. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). The Tensorflow version of this model was contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code can be found [here](https://github.com/google-research/tapas). - -Tips: - -- TAPAS is a model that uses relative position embeddings by default (restarting the position embeddings at every cell of the table). Note that this is something that was added after the publication of the original TAPAS paper. 
According to the authors, this usually results in a slightly better performance, and allows you to encode longer sequences without running out of embeddings. This is reflected in the `reset_position_index_per_cell` parameter of [`TapasConfig`], which is set to `True` by default. The default versions of the models available on the [hub](https://huggingface.co/models?search=tapas) all use relative position embeddings. You can still use the ones with absolute position embeddings by passing in an additional argument `revision="no_reset"` when calling the `from_pretrained()` method. Note that it's usually advised to pad the inputs on the right rather than the left. -- TAPAS is based on BERT, so `TAPAS-base` for example corresponds to a `BERT-base` architecture. Of course, `TAPAS-large` will result in the best performance (the results reported in the paper are from `TAPAS-large`). Results of the various sized models are shown on the [original Github repository](https://github.com/google-research/tapas>). -- TAPAS has checkpoints fine-tuned on SQA, which are capable of answering questions related to a table in a conversational set-up. This means that you can ask follow-up questions such as "what is his age?" related to the previous question. Note that the forward pass of TAPAS is a bit different in case of a conversational set-up: in that case, you have to feed every table-question pair one by one to the model, such that the `prev_labels` token type ids can be overwritten by the predicted `labels` of the model to the previous question. See "Usage" section for more info. -- TAPAS is similar to BERT and therefore relies on the masked language modeling (MLM) objective. It is therefore efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation. Models trained with a causal language modeling (CLM) objective are better in that regard. Note that TAPAS can be used as an encoder in the EncoderDecoderModel framework, to combine it with an autoregressive text decoder such as GPT-2. - -## Usage: fine-tuning - -Here we explain how you can fine-tune [`TapasForQuestionAnswering`] on your own dataset. - -**STEP 1: Choose one of the 3 ways in which you can use TAPAS - or experiment** - -Basically, there are 3 different ways in which one can fine-tune [`TapasForQuestionAnswering`], corresponding to the different datasets on which Tapas was fine-tuned: - -1. SQA: if you're interested in asking follow-up questions related to a table, in a conversational set-up. For example if you first ask "what's the name of the first actor?" then you can ask a follow-up question such as "how old is he?". Here, questions do not involve any aggregation (all questions are cell selection questions). -2. WTQ: if you're not interested in asking questions in a conversational set-up, but rather just asking questions related to a table, which might involve aggregation, such as counting a number of rows, summing up cell values or averaging cell values. You can then for example ask "what's the total number of goals Cristiano Ronaldo made in his career?". This case is also called **weak supervision**, since the model itself must learn the appropriate aggregation operator (SUM/COUNT/AVERAGE/NONE) given only the answer to the question as supervision. -3. WikiSQL-supervised: this dataset is based on WikiSQL with the model being given the ground truth aggregation operator during training. This is also called **strong supervision**. Here, learning the appropriate aggregation operator is much easier. 
- -To summarize: - -| **Task** | **Example dataset** | **Description** | -|-------------------------------------|---------------------|---------------------------------------------------------------------------------------------------------| -| Conversational | SQA | Conversational, only cell selection questions | -| Weak supervision for aggregation | WTQ | Questions might involve aggregation, and the model must learn this given only the answer as supervision | -| Strong supervision for aggregation | WikiSQL-supervised | Questions might involve aggregation, and the model must learn this given the gold aggregation operator | - - - -Initializing a model with a pre-trained base and randomly initialized classification heads from the hub can be done as shown below. - -```py ->>> from transformers import TapasConfig, TapasForQuestionAnswering - ->>> # for example, the base sized model with default SQA configuration ->>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base") - ->>> # or, the base sized model with WTQ configuration ->>> config = TapasConfig.from_pretrained("google/tapas-base-finetuned-wtq") ->>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) - ->>> # or, the base sized model with WikiSQL configuration ->>> config = TapasConfig("google-base-finetuned-wikisql-supervised") ->>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) -``` - -Of course, you don't necessarily have to follow one of these three ways in which TAPAS was fine-tuned. You can also experiment by defining any hyperparameters you want when initializing [`TapasConfig`], and then create a [`TapasForQuestionAnswering`] based on that configuration. For example, if you have a dataset that has both conversational questions and questions that might involve aggregation, then you can do it this way. Here's an example: - -```py ->>> from transformers import TapasConfig, TapasForQuestionAnswering - ->>> # you can initialize the classification heads any way you want (see docs of TapasConfig) ->>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True) ->>> # initializing the pre-trained base sized model with our custom classification heads ->>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) -``` - - -Initializing a model with a pre-trained base and randomly initialized classification heads from the hub can be done as shown below. Be sure to have installed the [tensorflow_probability](https://github.com/tensorflow/probability) dependency: - -```py ->>> from transformers import TapasConfig, TFTapasForQuestionAnswering - ->>> # for example, the base sized model with default SQA configuration ->>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base") - ->>> # or, the base sized model with WTQ configuration ->>> config = TapasConfig.from_pretrained("google/tapas-base-finetuned-wtq") ->>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) - ->>> # or, the base sized model with WikiSQL configuration ->>> config = TapasConfig("google-base-finetuned-wikisql-supervised") ->>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) -``` - -Of course, you don't necessarily have to follow one of these three ways in which TAPAS was fine-tuned. 
You can also experiment by defining any hyperparameters you want when initializing [`TapasConfig`], and then create a [`TFTapasForQuestionAnswering`] based on that configuration. For example, if you have a dataset that has both conversational questions and questions that might involve aggregation, then you can do it this way. Here's an example: - -```py ->>> from transformers import TapasConfig, TFTapasForQuestionAnswering - ->>> # you can initialize the classification heads any way you want (see docs of TapasConfig) ->>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True) ->>> # initializing the pre-trained base sized model with our custom classification heads ->>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) -``` - - - -What you can also do is start from an already fine-tuned checkpoint. A note here is that the already fine-tuned checkpoint on WTQ has some issues due to the L2-loss which is somewhat brittle. See [here](https://github.com/google-research/tapas/issues/91#issuecomment-735719340) for more info. - -For a list of all pre-trained and fine-tuned TAPAS checkpoints available on HuggingFace's hub, see [here](https://huggingface.co/models?search=tapas). - -**STEP 2: Prepare your data in the SQA format** - -Second, no matter what you picked above, you should prepare your dataset in the [SQA](https://www.microsoft.com/en-us/download/details.aspx?id=54253) format. This format is a TSV/CSV file with the following columns: - -- `id`: optional, id of the table-question pair, for bookkeeping purposes. -- `annotator`: optional, id of the person who annotated the table-question pair, for bookkeeping purposes. -- `position`: integer indicating if the question is the first, second, third,... related to the table. Only required in case of conversational setup (SQA). You don't need this column in case you're going for WTQ/WikiSQL-supervised. -- `question`: string -- `table_file`: string, name of a csv file containing the tabular data -- `answer_coordinates`: list of one or more tuples (each tuple being a cell coordinate, i.e. row, column pair that is part of the answer) -- `answer_text`: list of one or more strings (each string being a cell value that is part of the answer) -- `aggregation_label`: index of the aggregation operator. Only required in case of strong supervision for aggregation (the WikiSQL-supervised case) -- `float_answer`: the float answer to the question, if there is one (np.nan if there isn't). Only required in case of weak supervision for aggregation (such as WTQ and WikiSQL) - -The tables themselves should be present in a folder, each table being a separate csv file. Note that the authors of the TAPAS algorithm used conversion scripts with some automated logic to convert the other datasets (WTQ, WikiSQL) into the SQA format. The author explains this [here](https://github.com/google-research/tapas/issues/50#issuecomment-705465960). A conversion of this script that works with HuggingFace's implementation can be found [here](https://github.com/NielsRogge/tapas_utils). Interestingly, these conversion scripts are not perfect (the `answer_coordinates` and `float_answer` fields are populated based on the `answer_text`), meaning that WTQ and WikiSQL results could actually be improved. 
- -**STEP 3: Convert your data into tensors using TapasTokenizer** - - - -Third, given that you've prepared your data in this TSV/CSV format (and corresponding CSV files containing the tabular data), you can then use [`TapasTokenizer`] to convert table-question pairs into `input_ids`, `attention_mask`, `token_type_ids` and so on. Again, based on which of the three cases you picked above, [`TapasForQuestionAnswering`] requires different -inputs to be fine-tuned: - -| **Task** | **Required inputs** | -|------------------------------------|---------------------------------------------------------------------------------------------------------------------| -| Conversational | `input_ids`, `attention_mask`, `token_type_ids`, `labels` | -| Weak supervision for aggregation | `input_ids`, `attention_mask`, `token_type_ids`, `labels`, `numeric_values`, `numeric_values_scale`, `float_answer` | -| Strong supervision for aggregation | `input ids`, `attention mask`, `token type ids`, `labels`, `aggregation_labels` | - -[`TapasTokenizer`] creates the `labels`, `numeric_values` and `numeric_values_scale` based on the `answer_coordinates` and `answer_text` columns of the TSV file. The `float_answer` and `aggregation_labels` are already in the TSV file of step 2. Here's an example: - -```py ->>> from transformers import TapasTokenizer ->>> import pandas as pd - ->>> model_name = "google/tapas-base" ->>> tokenizer = TapasTokenizer.from_pretrained(model_name) - ->>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} ->>> queries = [ -... "What is the name of the first actor?", -... "How many movies has George Clooney played in?", -... "What is the total number of movies?", -... ] ->>> answer_coordinates = [[(0, 0)], [(2, 1)], [(0, 1), (1, 1), (2, 1)]] ->>> answer_text = [["Brad Pitt"], ["69"], ["209"]] ->>> table = pd.DataFrame.from_dict(data) ->>> inputs = tokenizer( -... table=table, -... queries=queries, -... answer_coordinates=answer_coordinates, -... answer_text=answer_text, -... padding="max_length", -... return_tensors="pt", -... ) ->>> inputs -{'input_ids': tensor([[ ... ]]), 'attention_mask': tensor([[...]]), 'token_type_ids': tensor([[[...]]]), -'numeric_values': tensor([[ ... ]]), 'numeric_values_scale: tensor([[ ... ]]), labels: tensor([[ ... ]])} -``` - -Note that [`TapasTokenizer`] expects the data of the table to be **text-only**. You can use `.astype(str)` on a dataframe to turn it into text-only data. -Of course, this only shows how to encode a single training example. It is advised to create a dataloader to iterate over batches: - -```py ->>> import torch ->>> import pandas as pd - ->>> tsv_path = "your_path_to_the_tsv_file" ->>> table_csv_path = "your_path_to_a_directory_containing_all_csv_files" - - ->>> class TableDataset(torch.utils.data.Dataset): -... def __init__(self, data, tokenizer): -... self.data = data -... self.tokenizer = tokenizer - -... def __getitem__(self, idx): -... item = data.iloc[idx] -... table = pd.read_csv(table_csv_path + item.table_file).astype( -... str -... ) # be sure to make your table data text only -... encoding = self.tokenizer( -... table=table, -... queries=item.question, -... answer_coordinates=item.answer_coordinates, -... answer_text=item.answer_text, -... truncation=True, -... padding="max_length", -... return_tensors="pt", -... ) -... # remove the batch dimension which the tokenizer adds by default -... encoding = {key: val.squeeze(0) for key, val in encoding.items()} -... 
# add the float_answer which is also required (weak supervision for aggregation case) -... encoding["float_answer"] = torch.tensor(item.float_answer) -... return encoding - -... def __len__(self): -... return len(self.data) - - ->>> data = pd.read_csv(tsv_path, sep="\t") ->>> train_dataset = TableDataset(data, tokenizer) ->>> train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32) -``` - - -Third, given that you've prepared your data in this TSV/CSV format (and corresponding CSV files containing the tabular data), you can then use [`TapasTokenizer`] to convert table-question pairs into `input_ids`, `attention_mask`, `token_type_ids` and so on. Again, based on which of the three cases you picked above, [`TFTapasForQuestionAnswering`] requires different -inputs to be fine-tuned: - -| **Task** | **Required inputs** | -|------------------------------------|---------------------------------------------------------------------------------------------------------------------| -| Conversational | `input_ids`, `attention_mask`, `token_type_ids`, `labels` | -| Weak supervision for aggregation | `input_ids`, `attention_mask`, `token_type_ids`, `labels`, `numeric_values`, `numeric_values_scale`, `float_answer` | -| Strong supervision for aggregation | `input ids`, `attention mask`, `token type ids`, `labels`, `aggregation_labels` | - -[`TapasTokenizer`] creates the `labels`, `numeric_values` and `numeric_values_scale` based on the `answer_coordinates` and `answer_text` columns of the TSV file. The `float_answer` and `aggregation_labels` are already in the TSV file of step 2. Here's an example: - -```py ->>> from transformers import TapasTokenizer ->>> import pandas as pd - ->>> model_name = "google/tapas-base" ->>> tokenizer = TapasTokenizer.from_pretrained(model_name) - ->>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} ->>> queries = [ -... "What is the name of the first actor?", -... "How many movies has George Clooney played in?", -... "What is the total number of movies?", -... ] ->>> answer_coordinates = [[(0, 0)], [(2, 1)], [(0, 1), (1, 1), (2, 1)]] ->>> answer_text = [["Brad Pitt"], ["69"], ["209"]] ->>> table = pd.DataFrame.from_dict(data) ->>> inputs = tokenizer( -... table=table, -... queries=queries, -... answer_coordinates=answer_coordinates, -... answer_text=answer_text, -... padding="max_length", -... return_tensors="tf", -... ) ->>> inputs -{'input_ids': tensor([[ ... ]]), 'attention_mask': tensor([[...]]), 'token_type_ids': tensor([[[...]]]), -'numeric_values': tensor([[ ... ]]), 'numeric_values_scale: tensor([[ ... ]]), labels: tensor([[ ... ]])} -``` - -Note that [`TapasTokenizer`] expects the data of the table to be **text-only**. You can use `.astype(str)` on a dataframe to turn it into text-only data. -Of course, this only shows how to encode a single training example. It is advised to create a dataloader to iterate over batches: - -```py ->>> import tensorflow as tf ->>> import pandas as pd - ->>> tsv_path = "your_path_to_the_tsv_file" ->>> table_csv_path = "your_path_to_a_directory_containing_all_csv_files" - - ->>> class TableDataset: -... def __init__(self, data, tokenizer): -... self.data = data -... self.tokenizer = tokenizer - -... def __iter__(self): -... for idx in range(self.__len__()): -... item = self.data.iloc[idx] -... table = pd.read_csv(table_csv_path + item.table_file).astype( -... str -... ) # be sure to make your table data text only -... encoding = self.tokenizer( -... 
table=table, -... queries=item.question, -... answer_coordinates=item.answer_coordinates, -... answer_text=item.answer_text, -... truncation=True, -... padding="max_length", -... return_tensors="tf", -... ) -... # remove the batch dimension which the tokenizer adds by default -... encoding = {key: tf.squeeze(val, 0) for key, val in encoding.items()} -... # add the float_answer which is also required (weak supervision for aggregation case) -... encoding["float_answer"] = tf.convert_to_tensor(item.float_answer, dtype=tf.float32) -... yield encoding["input_ids"], encoding["attention_mask"], encoding["numeric_values"], encoding[ -... "numeric_values_scale" -... ], encoding["token_type_ids"], encoding["labels"], encoding["float_answer"] - -... def __len__(self): -... return len(self.data) - - ->>> data = pd.read_csv(tsv_path, sep="\t") ->>> train_dataset = TableDataset(data, tokenizer) ->>> output_signature = ( -... tf.TensorSpec(shape=(512,), dtype=tf.int32), -... tf.TensorSpec(shape=(512,), dtype=tf.int32), -... tf.TensorSpec(shape=(512,), dtype=tf.float32), -... tf.TensorSpec(shape=(512,), dtype=tf.float32), -... tf.TensorSpec(shape=(512, 7), dtype=tf.int32), -... tf.TensorSpec(shape=(512,), dtype=tf.int32), -... tf.TensorSpec(shape=(512,), dtype=tf.float32), -... ) ->>> train_dataloader = tf.data.Dataset.from_generator(train_dataset, output_signature=output_signature).batch(32) -``` - - - -Note that here, we encode each table-question pair independently. This is fine as long as your dataset is **not conversational**. In case your dataset involves conversational questions (such as in SQA), then you should first group together the `queries`, `answer_coordinates` and `answer_text` per table (in the order of their `position` -index) and batch encode each table with its questions. This will make sure that the `prev_labels` token types (see docs of [`TapasTokenizer`]) are set correctly. See [this notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) for more info. See [this notebook](https://github.com/kamalkraj/Tapas-Tutorial/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) for more info regarding using the TensorFlow model. - -**STEP 4: Train (fine-tune) the model - - - -You can then fine-tune [`TapasForQuestionAnswering`] as follows (shown here for the weak supervision for aggregation case): - -```py ->>> from transformers import TapasConfig, TapasForQuestionAnswering, AdamW - ->>> # this is the default WTQ configuration ->>> config = TapasConfig( -... num_aggregation_labels=4, -... use_answer_as_supervision=True, -... answer_loss_cutoff=0.664694, -... cell_selection_preference=0.207951, -... huber_loss_delta=0.121194, -... init_cell_selection_weights_to_zero=True, -... select_one_column=True, -... allow_empty_column_selection=False, -... temperature=0.0352513, -... ) ->>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) - ->>> optimizer = AdamW(model.parameters(), lr=5e-5) - ->>> model.train() ->>> for epoch in range(2): # loop over the dataset multiple times -... for batch in train_dataloader: -... # get the inputs; -... input_ids = batch["input_ids"] -... attention_mask = batch["attention_mask"] -... token_type_ids = batch["token_type_ids"] -... labels = batch["labels"] -... numeric_values = batch["numeric_values"] -... numeric_values_scale = batch["numeric_values_scale"] -... float_answer = batch["float_answer"] - -... # zero the parameter gradients -... 
optimizer.zero_grad() - -... # forward + backward + optimize -... outputs = model( -... input_ids=input_ids, -... attention_mask=attention_mask, -... token_type_ids=token_type_ids, -... labels=labels, -... numeric_values=numeric_values, -... numeric_values_scale=numeric_values_scale, -... float_answer=float_answer, -... ) -... loss = outputs.loss -... loss.backward() -... optimizer.step() -``` - - -You can then fine-tune [`TFTapasForQuestionAnswering`] as follows (shown here for the weak supervision for aggregation case): - -```py ->>> import tensorflow as tf ->>> from transformers import TapasConfig, TFTapasForQuestionAnswering - ->>> # this is the default WTQ configuration ->>> config = TapasConfig( -... num_aggregation_labels=4, -... use_answer_as_supervision=True, -... answer_loss_cutoff=0.664694, -... cell_selection_preference=0.207951, -... huber_loss_delta=0.121194, -... init_cell_selection_weights_to_zero=True, -... select_one_column=True, -... allow_empty_column_selection=False, -... temperature=0.0352513, -... ) ->>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) - ->>> optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5) - ->>> for epoch in range(2): # loop over the dataset multiple times -... for batch in train_dataloader: -... # get the inputs; -... input_ids = batch[0] -... attention_mask = batch[1] -... token_type_ids = batch[4] -... labels = batch[-1] -... numeric_values = batch[2] -... numeric_values_scale = batch[3] -... float_answer = batch[6] - -... # forward + backward + optimize -... with tf.GradientTape() as tape: -... outputs = model( -... input_ids=input_ids, -... attention_mask=attention_mask, -... token_type_ids=token_type_ids, -... labels=labels, -... numeric_values=numeric_values, -... numeric_values_scale=numeric_values_scale, -... float_answer=float_answer, -... ) -... grads = tape.gradient(outputs.loss, model.trainable_weights) -... optimizer.apply_gradients(zip(grads, model.trainable_weights)) -``` - - - -## Usage: inference - - - -Here we explain how you can use [`TapasForQuestionAnswering`] or [`TFTapasForQuestionAnswering`] for inference (i.e. making predictions on new data). For inference, only `input_ids`, `attention_mask` and `token_type_ids` (which you can obtain using [`TapasTokenizer`]) have to be provided to the model to obtain the logits. Next, you can use the handy [`~models.tapas.tokenization_tapas.convert_logits_to_predictions`] method to convert these into predicted coordinates and optional aggregation indices. - -However, note that inference is **different** depending on whether or not the setup is conversational. In a non-conversational set-up, inference can be done in parallel on all table-question pairs of a batch. Here's an example of that: - -```py ->>> from transformers import TapasTokenizer, TapasForQuestionAnswering ->>> import pandas as pd - ->>> model_name = "google/tapas-base-finetuned-wtq" ->>> model = TapasForQuestionAnswering.from_pretrained(model_name) ->>> tokenizer = TapasTokenizer.from_pretrained(model_name) - ->>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} ->>> queries = [ -... "What is the name of the first actor?", -... "How many movies has George Clooney played in?", -... "What is the total number of movies?", -... 
] ->>> table = pd.DataFrame.from_dict(data) ->>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="pt") ->>> outputs = model(**inputs) ->>> predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions( -... inputs, outputs.logits.detach(), outputs.logits_aggregation.detach() -... ) - ->>> # let's print out the results: ->>> id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"} ->>> aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices] - ->>> answers = [] ->>> for coordinates in predicted_answer_coordinates: -... if len(coordinates) == 1: -... # only a single cell: -... answers.append(table.iat[coordinates[0]]) -... else: -... # multiple cells -... cell_values = [] -... for coordinate in coordinates: -... cell_values.append(table.iat[coordinate]) -... answers.append(", ".join(cell_values)) - ->>> display(table) ->>> print("") ->>> for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string): -... print(query) -... if predicted_agg == "NONE": -... print("Predicted answer: " + answer) -... else: -... print("Predicted answer: " + predicted_agg + " > " + answer) -What is the name of the first actor? -Predicted answer: Brad Pitt -How many movies has George Clooney played in? -Predicted answer: COUNT > 69 -What is the total number of movies? -Predicted answer: SUM > 87, 53, 69 -``` - - -Here we explain how you can use [`TFTapasForQuestionAnswering`] for inference (i.e. making predictions on new data). For inference, only `input_ids`, `attention_mask` and `token_type_ids` (which you can obtain using [`TapasTokenizer`]) have to be provided to the model to obtain the logits. Next, you can use the handy [`~models.tapas.tokenization_tapas.convert_logits_to_predictions`] method to convert these into predicted coordinates and optional aggregation indices. - -However, note that inference is **different** depending on whether or not the setup is conversational. In a non-conversational set-up, inference can be done in parallel on all table-question pairs of a batch. Here's an example of that: - -```py ->>> from transformers import TapasTokenizer, TFTapasForQuestionAnswering ->>> import pandas as pd - ->>> model_name = "google/tapas-base-finetuned-wtq" ->>> model = TFTapasForQuestionAnswering.from_pretrained(model_name) ->>> tokenizer = TapasTokenizer.from_pretrained(model_name) - ->>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} ->>> queries = [ -... "What is the name of the first actor?", -... "How many movies has George Clooney played in?", -... "What is the total number of movies?", -... ] ->>> table = pd.DataFrame.from_dict(data) ->>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="tf") ->>> outputs = model(**inputs) ->>> predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions( -... inputs, outputs.logits, outputs.logits_aggregation -... ) - ->>> # let's print out the results: ->>> id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"} ->>> aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices] - ->>> answers = [] ->>> for coordinates in predicted_answer_coordinates: -... if len(coordinates) == 1: -... # only a single cell: -... answers.append(table.iat[coordinates[0]]) -... else: -... # multiple cells -... cell_values = [] -... 
for coordinate in coordinates: -... cell_values.append(table.iat[coordinate]) -... answers.append(", ".join(cell_values)) - ->>> display(table) ->>> print("") ->>> for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string): -... print(query) -... if predicted_agg == "NONE": -... print("Predicted answer: " + answer) -... else: -... print("Predicted answer: " + predicted_agg + " > " + answer) -What is the name of the first actor? -Predicted answer: Brad Pitt -How many movies has George Clooney played in? -Predicted answer: COUNT > 69 -What is the total number of movies? -Predicted answer: SUM > 87, 53, 69 -``` - - - -In case of a conversational set-up, then each table-question pair must be provided **sequentially** to the model, such that the `prev_labels` token types can be overwritten by the predicted `labels` of the previous table-question pair. Again, more info can be found in [this notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) (for PyTorch) and [this notebook](https://github.com/kamalkraj/Tapas-Tutorial/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) (for TensorFlow). - -## TAPAS specific outputs -[[autodoc]] models.tapas.modeling_tapas.TableQuestionAnsweringOutput - -## TapasConfig -[[autodoc]] TapasConfig - -## TapasTokenizer -[[autodoc]] TapasTokenizer - - __call__ - - convert_logits_to_predictions - - save_vocabulary - -## TapasModel -[[autodoc]] TapasModel - - forward - -## TapasForMaskedLM -[[autodoc]] TapasForMaskedLM - - forward - -## TapasForSequenceClassification -[[autodoc]] TapasForSequenceClassification - - forward - -## TapasForQuestionAnswering -[[autodoc]] TapasForQuestionAnswering - - forward - -## TFTapasModel -[[autodoc]] TFTapasModel - - call - -## TFTapasForMaskedLM -[[autodoc]] TFTapasForMaskedLM - - call - -## TFTapasForSequenceClassification -[[autodoc]] TFTapasForSequenceClassification - - call - -## TFTapasForQuestionAnswering -[[autodoc]] TFTapasForQuestionAnswering - - call \ No newline at end of file diff --git a/docs/source/en/model_doc/tapex.md b/docs/source/en/model_doc/tapex.md new file mode 100644 index 000000000000..52234b5c59bc --- /dev/null +++ b/docs/source/en/model_doc/tapex.md @@ -0,0 +1,143 @@ + + +# TAPEX + + + +This model is in maintenance mode only, so we won't accept any new PRs changing its code. + +If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0. +You can do so by running the following command: `pip install -U transformers==4.30.0`. + + + +## Overview + +The TAPEX model was proposed in [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, +Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. TAPEX pre-trains a BART model to solve synthetic SQL queries, after +which it can be fine-tuned to answer natural language questions related to tabular data, as well as performing table fact checking. + +TAPEX has been fine-tuned on several datasets: +- [SQA](https://www.microsoft.com/en-us/download/details.aspx?id=54253) (Sequential Question Answering by Microsoft) +- [WTQ](https://github.com/ppasupat/WikiTableQuestions) (Wiki Table Questions by Stanford University) +- [WikiSQL](https://github.com/salesforce/WikiSQL) (by Salesforce) +- [TabFact](https://tabfact.github.io/) (by USCB NLP Lab). 
+
+The abstract from the paper is the following:
+
+*Recent progress in language model pre-training has achieved a great success via leveraging large-scale unstructured textual data. However, it is
+still a challenge to apply pre-training on structured tabular data due to the absence of large-scale high-quality tabular data. In this paper, we
+propose TAPEX to show that table pre-training can be achieved by learning a neural SQL executor over a synthetic corpus, which is obtained by automatically
+synthesizing executable SQL queries and their execution outputs. TAPEX addresses the data scarcity challenge via guiding the language model to mimic a SQL
+executor on the diverse, large-scale and high-quality synthetic corpus. We evaluate TAPEX on four benchmark datasets. Experimental results demonstrate that
+TAPEX outperforms previous table pre-training approaches by a large margin and achieves new state-of-the-art results on all of them. This includes improvements
+on the weakly-supervised WikiSQL denotation accuracy to 89.5% (+2.3%), the WikiTableQuestions denotation accuracy to 57.5% (+4.8%), the SQA denotation accuracy
+to 74.5% (+3.5%), and the TabFact accuracy to 84.2% (+3.2%). To our knowledge, this is the first work to exploit table pre-training via synthetic executable programs
+and to achieve new state-of-the-art results on various downstream tasks.*
+
+Tips:
+
+- TAPEX is a generative (seq2seq) model. One can directly plug in the weights of TAPEX into a BART model.
+- TAPEX has checkpoints on the hub that are either pre-trained only, or fine-tuned on WTQ, SQA, WikiSQL and TabFact.
+- Sentences + tables are presented to the model as `sentence + " " + linearized table`. The linearized table has the following format:
+  `col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ...`.
+- TAPEX has its own tokenizer, which allows you to prepare all data for the model easily. One can pass Pandas DataFrames and strings to the tokenizer,
+  and it will automatically create the `input_ids` and `attention_mask` (as shown in the usage examples below).
+
+## Usage: inference
+
+Below, we illustrate how to use TAPEX for table question answering. As one can see, one can directly plug in the weights of TAPEX into a BART model.
+We use the [Auto API](auto), which will automatically instantiate the appropriate tokenizer ([`TapexTokenizer`]) and model ([`BartForConditionalGeneration`]) for us,
+based on the configuration file of the checkpoint on the hub.
+
+```python
+>>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+>>> import pandas as pd
+
+>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq")
+>>> model = AutoModelForSeq2SeqLM.from_pretrained("microsoft/tapex-large-finetuned-wtq")
+
+>>> # prepare table + question
+>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
+>>> table = pd.DataFrame.from_dict(data)
+>>> question = "how many movies does Leonardo Di Caprio have?"
+
+>>> encoding = tokenizer(table, question, return_tensors="pt")
+
+>>> # let the model generate an answer autoregressively
+>>> outputs = model.generate(**encoding)
+
+>>> # decode back to text
+>>> predicted_answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
+>>> print(predicted_answer)
+53
+```
+
+Note that [`TapexTokenizer`] also supports batched inference.
Hence, one can provide a batch of different tables/questions, or a batch of a single table +and multiple questions, or a batch of a single query and multiple tables. Let's illustrate this: + +```python +>>> # prepare table + question +>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} +>>> table = pd.DataFrame.from_dict(data) +>>> questions = [ +... "how many movies does Leonardo Di Caprio have?", +... "which actor has 69 movies?", +... "what's the first name of the actor who has 87 movies?", +... ] +>>> encoding = tokenizer(table, questions, padding=True, return_tensors="pt") + +>>> # let the model generate an answer autoregressively +>>> outputs = model.generate(**encoding) + +>>> # decode back to text +>>> tokenizer.batch_decode(outputs, skip_special_tokens=True) +[' 53', ' george clooney', ' brad pitt'] +``` + +In case one wants to do table verification (i.e. the task of determining whether a given sentence is supported or refuted by the contents +of a table), one can instantiate a [`BartForSequenceClassification`] model. TAPEX has checkpoints on the hub fine-tuned on TabFact, an important +benchmark for table fact checking (it achieves 84% accuracy). The code example below again leverages the [Auto API](auto). + +```python +>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification + +>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-tabfact") +>>> model = AutoModelForSequenceClassification.from_pretrained("microsoft/tapex-large-finetuned-tabfact") + +>>> # prepare table + sentence +>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} +>>> table = pd.DataFrame.from_dict(data) +>>> sentence = "George Clooney has 30 movies" + +>>> encoding = tokenizer(table, sentence, return_tensors="pt") + +>>> # forward pass +>>> outputs = model(**encoding) + +>>> # print prediction +>>> predicted_class_idx = outputs.logits[0].argmax(dim=0).item() +>>> print(model.config.id2label[predicted_class_idx]) +Refused +``` + + +## TapexTokenizer + +[[autodoc]] TapexTokenizer + - __call__ + - save_vocabulary \ No newline at end of file diff --git a/docs/source/en/model_doc/tapex.mdx b/docs/source/en/model_doc/tapex.mdx deleted file mode 100644 index f6e65764e50d..000000000000 --- a/docs/source/en/model_doc/tapex.mdx +++ /dev/null @@ -1,130 +0,0 @@ - - -# TAPEX - -## Overview - -The TAPEX model was proposed in [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, -Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. TAPEX pre-trains a BART model to solve synthetic SQL queries, after -which it can be fine-tuned to answer natural language questions related to tabular data, as well as performing table fact checking. - -TAPEX has been fine-tuned on several datasets: -- [SQA](https://www.microsoft.com/en-us/download/details.aspx?id=54253) (Sequential Question Answering by Microsoft) -- [WTQ](https://github.com/ppasupat/WikiTableQuestions) (Wiki Table Questions by Stanford University) -- [WikiSQL](https://github.com/salesforce/WikiSQL) (by Salesforce) -- [TabFact](https://tabfact.github.io/) (by USCB NLP Lab). - -The abstract from the paper is the following: - -*Recent progress in language model pre-training has achieved a great success via leveraging large-scale unstructured textual data. 
However, it is -still a challenge to apply pre-training on structured tabular data due to the absence of large-scale high-quality tabular data. In this paper, we -propose TAPEX to show that table pre-training can be achieved by learning a neural SQL executor over a synthetic corpus, which is obtained by automatically -synthesizing executable SQL queries and their execution outputs. TAPEX addresses the data scarcity challenge via guiding the language model to mimic a SQL -executor on the diverse, large-scale and high-quality synthetic corpus. We evaluate TAPEX on four benchmark datasets. Experimental results demonstrate that -TAPEX outperforms previous table pre-training approaches by a large margin and achieves new state-of-the-art results on all of them. This includes improvements -on the weakly-supervised WikiSQL denotation accuracy to 89.5% (+2.3%), the WikiTableQuestions denotation accuracy to 57.5% (+4.8%), the SQA denotation accuracy -to 74.5% (+3.5%), and the TabFact accuracy to 84.2% (+3.2%). To our knowledge, this is the first work to exploit table pre-training via synthetic executable programs -and to achieve new state-of-the-art results on various downstream tasks.* - -Tips: - -- TAPEX is a generative (seq2seq) model. One can directly plug in the weights of TAPEX into a BART model. -- TAPEX has checkpoints on the hub that are either pre-trained only, or fine-tuned on WTQ, SQA, WikiSQL and TabFact. -- Sentences + tables are presented to the model as `sentence + " " + linearized table`. The linearized table has the following format: - `col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ...`. -- TAPEX has its own tokenizer, that allows to prepare all data for the model easily. One can pass Pandas DataFrames and strings to the tokenizer, - and it will automatically create the `input_ids` and `attention_mask` (as shown in the usage examples below). - -## Usage: inference - -Below, we illustrate how to use TAPEX for table question answering. As one can see, one can directly plug in the weights of TAPEX into a BART model. -We use the [Auto API](auto), which will automatically instantiate the appropriate tokenizer ([`TapexTokenizer`]) and model ([`BartForConditionalGeneration`]) for us, -based on the configuration file of the checkpoint on the hub. - -```python ->>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM ->>> import pandas as pd - ->>> tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq") ->>> model = AutoModelForSeq2SeqLM.from_pretrained("microsoft/tapex-large-finetuned-wtq") - ->>> # prepare table + question ->>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} ->>> table = pd.DataFrame.from_dict(data) ->>> question = "how many movies does Leonardo Di Caprio have?" - ->>> encoding = tokenizer(table, question, return_tensors="pt") - ->>> # let the model generate an answer autoregressively ->>> outputs = model.generate(**encoding) - ->>> # decode back to text ->>> predicted_answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0] ->>> print(predicted_answer) -53 -``` - -Note that [`TapexTokenizer`] also supports batched inference. Hence, one can provide a batch of different tables/questions, or a batch of a single table -and multiple questions, or a batch of a single query and multiple tables. 
Let's illustrate this: - -```python ->>> # prepare table + question ->>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} ->>> table = pd.DataFrame.from_dict(data) ->>> questions = [ -... "how many movies does Leonardo Di Caprio have?", -... "which actor has 69 movies?", -... "what's the first name of the actor who has 87 movies?", -... ] ->>> encoding = tokenizer(table, questions, padding=True, return_tensors="pt") - ->>> # let the model generate an answer autoregressively ->>> outputs = model.generate(**encoding) - ->>> # decode back to text ->>> tokenizer.batch_decode(outputs, skip_special_tokens=True) -[' 53', ' george clooney', ' brad pitt'] -``` - -In case one wants to do table verification (i.e. the task of determining whether a given sentence is supported or refuted by the contents -of a table), one can instantiate a [`BartForSequenceClassification`] model. TAPEX has checkpoints on the hub fine-tuned on TabFact, an important -benchmark for table fact checking (it achieves 84% accuracy). The code example below again leverages the [Auto API](auto). - -```python ->>> from transformers import AutoTokenizer, AutoModelForSequenceClassification - ->>> tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-tabfact") ->>> model = AutoModelForSequenceClassification.from_pretrained("microsoft/tapex-large-finetuned-tabfact") - ->>> # prepare table + sentence ->>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} ->>> table = pd.DataFrame.from_dict(data) ->>> sentence = "George Clooney has 30 movies" - ->>> encoding = tokenizer(table, sentence, return_tensors="pt") - ->>> # forward pass ->>> outputs = model(**encoding) - ->>> # print prediction ->>> predicted_class_idx = outputs.logits[0].argmax(dim=0).item() ->>> print(model.config.id2label[predicted_class_idx]) -Refused -``` - - -## TapexTokenizer - -[[autodoc]] TapexTokenizer - - __call__ - - save_vocabulary \ No newline at end of file diff --git a/docs/source/en/model_doc/time_series_transformer.md b/docs/source/en/model_doc/time_series_transformer.md new file mode 100644 index 000000000000..208798aa1c68 --- /dev/null +++ b/docs/source/en/model_doc/time_series_transformer.md @@ -0,0 +1,83 @@ + + +# Time Series Transformer + + + +This is a recently introduced model so the API hasn't been tested extensively. There may be some bugs or slight +breaking changes to fix it in the future. If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title). + + + +## Overview + +The Time Series Transformer model is a vanilla encoder-decoder Transformer for time series forecasting. + +Tips: + +- Similar to other models in the library, [`TimeSeriesTransformerModel`] is the raw Transformer without any head on top, and [`TimeSeriesTransformerForPrediction`] +adds a distribution head on top of the former, which can be used for time-series forecasting. Note that this is a so-called probabilistic forecasting model, not a +point forecasting model. This means that the model learns a distribution, from which one can sample. The model doesn't directly output values. 
+- [`TimeSeriesTransformerForPrediction`] consists of 2 blocks: an encoder, which takes a `context_length` of time series values as input (called `past_values`), +and a decoder, which predicts a `prediction_length` of time series values into the future (called `future_values`). During training, one needs to provide +pairs of (`past_values` and `future_values`) to the model. +- In addition to the raw (`past_values` and `future_values`), one typically provides additional features to the model. These can be the following: + - `past_time_features`: temporal features which the model will add to `past_values`. These serve as "positional encodings" for the Transformer encoder. + Examples are "day of the month", "month of the year", etc. as scalar values (and then stacked together as a vector). + e.g. if a given time-series value was obtained on the 11th of August, then one could have [11, 8] as time feature vector (11 being "day of the month", 8 being "month of the year"). + - `future_time_features`: temporal features which the model will add to `future_values`. These serve as "positional encodings" for the Transformer decoder. + Examples are "day of the month", "month of the year", etc. as scalar values (and then stacked together as a vector). + e.g. if a given time-series value was obtained on the 11th of August, then one could have [11, 8] as time feature vector (11 being "day of the month", 8 being "month of the year"). + - `static_categorical_features`: categorical features which are static over time (i.e., have the same value for all `past_values` and `future_values`). + An example here is the store ID or region ID that identifies a given time-series. + Note that these features need to be known for ALL data points (also those in the future). + - `static_real_features`: real-valued features which are static over time (i.e., have the same value for all `past_values` and `future_values`). + An example here is the image representation of the product for which you have the time-series values (like the [ResNet](resnet) embedding of a "shoe" picture, + if your time-series is about the sales of shoes). + Note that these features need to be known for ALL data points (also those in the future). +- The model is trained using "teacher-forcing", similar to how a Transformer is trained for machine translation. This means that, during training, one shifts the +`future_values` one position to the right as input to the decoder, prepended by the last value of `past_values`. At each time step, the model needs to predict the +next target. So the set-up of training is similar to a GPT model for language, except that there's no notion of `decoder_start_token_id` (we just use the last value +of the context as initial input for the decoder). +- At inference time, we give the final value of the `past_values` as input to the decoder. Next, we can sample from the model to make a prediction at the next time step, +which is then fed to the decoder in order to make the next prediction (also called autoregressive generation). + + +This model was contributed by [kashif](https://huggingface.co/kashif). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. 
+ +- Check out the Time Series Transformer blog-post in HuggingFace blog: [Probabilistic Time Series Forecasting with 🤗 Transformers](https://huggingface.co/blog/time-series-transformers) + + +## TimeSeriesTransformerConfig + +[[autodoc]] TimeSeriesTransformerConfig + + +## TimeSeriesTransformerModel + +[[autodoc]] TimeSeriesTransformerModel + - forward + + +## TimeSeriesTransformerForPrediction + +[[autodoc]] TimeSeriesTransformerForPrediction + - forward diff --git a/docs/source/en/model_doc/time_series_transformer.mdx b/docs/source/en/model_doc/time_series_transformer.mdx deleted file mode 100644 index 5dedef02eaa8..000000000000 --- a/docs/source/en/model_doc/time_series_transformer.mdx +++ /dev/null @@ -1,73 +0,0 @@ - - -# Time Series Transformer - - - -This is a recently introduced model so the API hasn't been tested extensively. There may be some bugs or slight -breaking changes to fix it in the future. If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title). - - - -## Overview - -The Time Series Transformer model is a vanilla encoder-decoder Transformer for time series forecasting. - -Tips: - -- Similar to other models in the library, [`TimeSeriesTransformerModel`] is the raw Transformer without any head on top, and [`TimeSeriesTransformerForPrediction`] -adds a distribution head on top of the former, which can be used for time-series forecasting. Note that this is a so-called probabilistic forecasting model, not a -point forecasting model. This means that the model learns a distribution, from which one can sample. The model doesn't directly output values. -- [`TimeSeriesTransformerForPrediction`] consists of 2 blocks: an encoder, which takes a `context_length` of time series values as input (called `past_values`), -and a decoder, which predicts a `prediction_length` of time series values into the future (called `future_values`). During training, one needs to provide -pairs of (`past_values` and `future_values`) to the model. -- In addition to the raw (`past_values` and `future_values`), one typically provides additional features to the model. These can be the following: - - `past_time_features`: temporal features which the model will add to `past_values`. These serve as "positional encodings" for the Transformer encoder. - Examples are "day of the month", "month of the year", etc. as scalar values (and then stacked together as a vector). - e.g. if a given time-series value was obtained on the 11th of August, then one could have [11, 8] as time feature vector (11 being "day of the month", 8 being "month of the year"). - - `future_time_features`: temporal features which the model will add to `future_values`. These serve as "positional encodings" for the Transformer decoder. - Examples are "day of the month", "month of the year", etc. as scalar values (and then stacked together as a vector). - e.g. if a given time-series value was obtained on the 11th of August, then one could have [11, 8] as time feature vector (11 being "day of the month", 8 being "month of the year"). - - `static_categorical_features`: categorical features which are static over time (i.e., have the same value for all `past_values` and `future_values`). - An example here is the store ID or region ID that identifies a given time-series. - Note that these features need to be known for ALL data points (also those in the future). 
- - `static_real_features`: real-valued features which are static over time (i.e., have the same value for all `past_values` and `future_values`). - An example here is the image representation of the product for which you have the time-series values (like the [ResNet](resnet) embedding of a "shoe" picture, - if your time-series is about the sales of shoes). - Note that these features need to be known for ALL data points (also those in the future). -- The model is trained using "teacher-forcing", similar to how a Transformer is trained for machine translation. This means that, during training, one shifts the -`future_values` one position to the right as input to the decoder, prepended by the last value of `past_values`. At each time step, the model needs to predict the -next target. So the set-up of training is similar to a GPT model for language, except that there's no notion of `decoder_start_token_id` (we just use the last value -of the context as initial input for the decoder). -- At inference time, we give the final value of the `past_values` as input to the decoder. Next, we can sample from the model to make a prediction at the next time step, -which is then fed to the decoder in order to make the next prediction (also called autoregressive generation). - - -This model was contributed by [kashif]( + +# TimeSformer + +## Overview + +The TimeSformer model was proposed in [TimeSformer: Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Facebook Research. +This work is a milestone in action-recognition field being the first video transformer. It inspired many transformer based video understanding and classification papers. + +The abstract from the paper is the following: + +*We present a convolution-free approach to video classification built exclusively on self-attention over space and time. Our method, named "TimeSformer," adapts the standard Transformer architecture to video by enabling spatiotemporal feature learning directly from a sequence of frame-level patches. Our experimental study compares different self-attention schemes and suggests that "divided attention," where temporal attention and spatial attention are separately applied within each block, leads to the best video classification accuracy among the design choices considered. Despite the radically new design, TimeSformer achieves state-of-the-art results on several action recognition benchmarks, including the best reported accuracy on Kinetics-400 and Kinetics-600. Finally, compared to 3D convolutional networks, our model is faster to train, it can achieve dramatically higher test efficiency (at a small drop in accuracy), and it can also be applied to much longer video clips (over one minute long). Code and models are available at: [this https URL](https://github.com/facebookresearch/TimeSformer).* + +Tips: + +There are many pretrained variants. Select your pretrained model based on the dataset it is trained on. Moreover, the number of input frames per clip changes based on the model size so you should consider this parameter while selecting your pretrained model. + +This model was contributed by [fcakyon](https://huggingface.co/fcakyon). +The original code can be found [here](https://github.com/facebookresearch/TimeSformer). 
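Below is a minimal sketch of the video classification workflow, assuming the `facebook/timesformer-base-finetuned-k400` checkpoint (taken here to expect 8 sampled frames per clip); the random frames only stand in for a real decoded video:

```python
import numpy as np
import torch
from transformers import AutoImageProcessor, TimesformerForVideoClassification

# dummy clip: 8 frames of shape (channels, height, width); replace with frames decoded from a real video
video = list(np.random.randn(8, 3, 224, 224))

# checkpoint name is an assumption for illustration; pick the variant matching your dataset and frame count
processor = AutoImageProcessor.from_pretrained("facebook/timesformer-base-finetuned-k400")
model = TimesformerForVideoClassification.from_pretrained("facebook/timesformer-base-finetuned-k400")

inputs = processor(video, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# this checkpoint was fine-tuned on Kinetics-400, so the labels are Kinetics-400 classes
print(model.config.id2label[logits.argmax(-1).item()])
```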
+ +## Documentation resources + +- [Video classification task guide](../tasks/video_classification) + +## TimesformerConfig + +[[autodoc]] TimesformerConfig + +## TimesformerModel + +[[autodoc]] TimesformerModel + - forward + +## TimesformerForVideoClassification + +[[autodoc]] TimesformerForVideoClassification + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/timesformer.mdx b/docs/source/en/model_doc/timesformer.mdx deleted file mode 100644 index 602ec4f4f2a7..000000000000 --- a/docs/source/en/model_doc/timesformer.mdx +++ /dev/null @@ -1,44 +0,0 @@ - - -# TimeSformer - -## Overview - -The TimeSformer model was proposed in [TimeSformer: Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Facebook Research. -This work is a milestone in action-recognition field being the first video transformer. It inspired many transformer based video understanding and classification papers. - -The abstract from the paper is the following: - -*We present a convolution-free approach to video classification built exclusively on self-attention over space and time. Our method, named "TimeSformer," adapts the standard Transformer architecture to video by enabling spatiotemporal feature learning directly from a sequence of frame-level patches. Our experimental study compares different self-attention schemes and suggests that "divided attention," where temporal attention and spatial attention are separately applied within each block, leads to the best video classification accuracy among the design choices considered. Despite the radically new design, TimeSformer achieves state-of-the-art results on several action recognition benchmarks, including the best reported accuracy on Kinetics-400 and Kinetics-600. Finally, compared to 3D convolutional networks, our model is faster to train, it can achieve dramatically higher test efficiency (at a small drop in accuracy), and it can also be applied to much longer video clips (over one minute long). Code and models are available at: [this https URL](https://github.com/facebookresearch/TimeSformer).* - -Tips: - -There are many pretrained variants. Select your pretrained model based on the dataset it is trained on. Moreover, the number of input frames per clip changes based on the model size so you should consider this parameter while selecting your pretrained model. - -This model was contributed by [fcakyon](https://huggingface.co/fcakyon). -The original code can be found [here](https://github.com/facebookresearch/TimeSformer). - - -## TimesformerConfig - -[[autodoc]] TimesformerConfig - -## TimesformerModel - -[[autodoc]] TimesformerModel - - forward - -## TimesformerForVideoClassification - -[[autodoc]] TimesformerForVideoClassification - - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/trajectory_transformer.md b/docs/source/en/model_doc/trajectory_transformer.md new file mode 100644 index 000000000000..548642f7bb9f --- /dev/null +++ b/docs/source/en/model_doc/trajectory_transformer.md @@ -0,0 +1,62 @@ + + +# Trajectory Transformer + + + +This model is in maintenance mode only, so we won't accept any new PRs changing its code. + +If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0. +You can do so by running the following command: `pip install -U transformers==4.30.0`. 
+ + + +## Overview + +The Trajectory Transformer model was proposed in [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine. + +The abstract from the paper is the following: + +*Reinforcement learning (RL) is typically concerned with estimating stationary policies or single-step models, +leveraging the Markov property to factorize problems in time. However, we can also view RL as a generic sequence +modeling problem, with the goal being to produce a sequence of actions that leads to a sequence of high rewards. +Viewed in this way, it is tempting to consider whether high-capacity sequence prediction models that work well +in other domains, such as natural-language processing, can also provide effective solutions to the RL problem. +To this end, we explore how RL can be tackled with the tools of sequence modeling, using a Transformer architecture +to model distributions over trajectories and repurposing beam search as a planning algorithm. Framing RL as sequence +modeling problem simplifies a range of design decisions, allowing us to dispense with many of the components common +in offline RL algorithms. We demonstrate the flexibility of this approach across long-horizon dynamics prediction, +imitation learning, goal-conditioned RL, and offline RL. Further, we show that this approach can be combined with +existing model-free algorithms to yield a state-of-the-art planner in sparse-reward, long-horizon tasks.* + +Tips: + +This Transformer is used for deep reinforcement learning. To use it, you need to create sequences from +actions, states and rewards from all previous timesteps. This model will treat all these elements together +as one big sequence (a trajectory). + +This model was contributed by [CarlCochet](https://huggingface.co/CarlCochet). The original code can be found [here](https://github.com/jannerm/trajectory-transformer). + +## TrajectoryTransformerConfig + +[[autodoc]] TrajectoryTransformerConfig + + +## TrajectoryTransformerModel + +[[autodoc]] TrajectoryTransformerModel + - forward diff --git a/docs/source/en/model_doc/trajectory_transformer.mdx b/docs/source/en/model_doc/trajectory_transformer.mdx deleted file mode 100644 index da7a55a50eca..000000000000 --- a/docs/source/en/model_doc/trajectory_transformer.mdx +++ /dev/null @@ -1,49 +0,0 @@ - - -# Trajectory Transformer - -## Overview - -The Trajectory Transformer model was proposed in [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine. - -The abstract from the paper is the following: - -*Reinforcement learning (RL) is typically concerned with estimating stationary policies or single-step models, -leveraging the Markov property to factorize problems in time. However, we can also view RL as a generic sequence -modeling problem, with the goal being to produce a sequence of actions that leads to a sequence of high rewards. -Viewed in this way, it is tempting to consider whether high-capacity sequence prediction models that work well -in other domains, such as natural-language processing, can also provide effective solutions to the RL problem. -To this end, we explore how RL can be tackled with the tools of sequence modeling, using a Transformer architecture -to model distributions over trajectories and repurposing beam search as a planning algorithm. 
Framing RL as sequence -modeling problem simplifies a range of design decisions, allowing us to dispense with many of the components common -in offline RL algorithms. We demonstrate the flexibility of this approach across long-horizon dynamics prediction, -imitation learning, goal-conditioned RL, and offline RL. Further, we show that this approach can be combined with -existing model-free algorithms to yield a state-of-the-art planner in sparse-reward, long-horizon tasks.* - -Tips: - -This Transformer is used for deep reinforcement learning. To use it, you need to create sequences from -actions, states and rewards from all previous timesteps. This model will treat all these elements together -as one big sequence (a trajectory). - -This model was contributed by [CarlCochet](https://huggingface.co/CarlCochet). The original code can be found [here](https://github.com/jannerm/trajectory-transformer). - -## TrajectoryTransformerConfig - -[[autodoc]] TrajectoryTransformerConfig - - -## TrajectoryTransformerModel - -[[autodoc]] TrajectoryTransformerModel - - forward diff --git a/docs/source/en/model_doc/transfo-xl.md b/docs/source/en/model_doc/transfo-xl.md new file mode 100644 index 000000000000..beb5ba2fea83 --- /dev/null +++ b/docs/source/en/model_doc/transfo-xl.md @@ -0,0 +1,123 @@ + + +# Transformer XL + +
+ +## Overview + +The Transformer-XL model was proposed in [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai, Zhilin Yang, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan +Salakhutdinov. It's a causal (uni-directional) transformer with relative positioning (sinusoïdal) embeddings which can +reuse previously computed hidden-states to attend to longer context (memory). This model also uses adaptive softmax +inputs and outputs (tied). + +The abstract from the paper is the following: + +*Transformers have a potential of learning longer-term dependency, but are limited by a fixed-length context in the +setting of language modeling. We propose a novel neural architecture Transformer-XL that enables learning dependency +beyond a fixed length without disrupting temporal coherence. It consists of a segment-level recurrence mechanism and a +novel positional encoding scheme. Our method not only enables capturing longer-term dependency, but also resolves the +context fragmentation problem. As a result, Transformer-XL learns dependency that is 80% longer than RNNs and 450% +longer than vanilla Transformers, achieves better performance on both short and long sequences, and is up to 1,800+ +times faster than vanilla Transformers during evaluation. Notably, we improve the state-of-the-art results of +bpc/perplexity to 0.99 on enwiki8, 1.08 on text8, 18.3 on WikiText-103, 21.8 on One Billion Word, and 54.5 on Penn +Treebank (without finetuning). When trained only on WikiText-103, Transformer-XL manages to generate reasonably +coherent, novel text articles with thousands of tokens.* + +Tips: + +- Transformer-XL uses relative sinusoidal positional embeddings. Padding can be done on the left or on the right. The + original implementation trains on SQuAD with padding on the left, therefore the padding defaults are set to left. +- Transformer-XL is one of the few models that has no sequence length limit. +- Same as a regular GPT model, but introduces a recurrence mechanism for two consecutive segments (similar to a regular RNNs with two consecutive inputs). In this context, a segment is a number of consecutive tokens (for instance 512) that may span across multiple documents, and segments are fed in order to the model. +- Basically, the hidden states of the previous segment are concatenated to the current input to compute the attention scores. This allows the model to pay attention to information that was in the previous segment as well as the current one. By stacking multiple attention layers, the receptive field can be increased to multiple previous segments. +- This changes the positional embeddings to positional relative embeddings (as the regular positional embeddings would give the same results in the current input and the current hidden state at a given position) and needs to make some adjustments in the way attention scores are computed. + +This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/kimiyoung/transformer-xl). 
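As a rough sketch of the recurrence mechanism described in the tips above, the example below (assuming the `transfo-xl-wt103` checkpoint) feeds two segments in order and reuses the `mems` returned for the first segment when processing the second:

```python
import torch
from transformers import TransfoXLLMHeadModel, TransfoXLTokenizer

# checkpoint name is an assumption for illustration
tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl-wt103")
model = TransfoXLLMHeadModel.from_pretrained("transfo-xl-wt103")

segment_1 = tokenizer("The recurrence mechanism lets the model", return_tensors="pt")
segment_2 = tokenizer("attend to hidden states of previous segments .", return_tensors="pt")

with torch.no_grad():
    # first segment: no memory yet
    outputs_1 = model(segment_1["input_ids"])
    # second segment: reuse the cached hidden states ("mems") of the first segment
    outputs_2 = model(segment_2["input_ids"], mems=outputs_1.mems)

# one cached hidden-state tensor per layer
print(len(outputs_2.mems))
```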
+ + + +TransformerXL does **not** work with *torch.nn.DataParallel* due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035) + + + +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Causal language modeling task guide](../tasks/language_modeling) + +## TransfoXLConfig + +[[autodoc]] TransfoXLConfig + +## TransfoXLTokenizer + +[[autodoc]] TransfoXLTokenizer + - save_vocabulary + +## TransfoXL specific outputs + +[[autodoc]] models.transfo_xl.modeling_transfo_xl.TransfoXLModelOutput + +[[autodoc]] models.transfo_xl.modeling_transfo_xl.TransfoXLLMHeadModelOutput + +[[autodoc]] models.transfo_xl.modeling_tf_transfo_xl.TFTransfoXLModelOutput + +[[autodoc]] models.transfo_xl.modeling_tf_transfo_xl.TFTransfoXLLMHeadModelOutput + +## TransfoXLModel + +[[autodoc]] TransfoXLModel + - forward + +## TransfoXLLMHeadModel + +[[autodoc]] TransfoXLLMHeadModel + - forward + +## TransfoXLForSequenceClassification + +[[autodoc]] TransfoXLForSequenceClassification + - forward + +## TFTransfoXLModel + +[[autodoc]] TFTransfoXLModel + - call + +## TFTransfoXLLMHeadModel + +[[autodoc]] TFTransfoXLLMHeadModel + - call + +## TFTransfoXLForSequenceClassification + +[[autodoc]] TFTransfoXLForSequenceClassification + - call + +## Internal Layers + +[[autodoc]] AdaptiveEmbedding + +[[autodoc]] TFAdaptiveEmbedding diff --git a/docs/source/en/model_doc/transfo-xl.mdx b/docs/source/en/model_doc/transfo-xl.mdx deleted file mode 100644 index 7e2a7701426c..000000000000 --- a/docs/source/en/model_doc/transfo-xl.mdx +++ /dev/null @@ -1,103 +0,0 @@ - - -# Transformer XL - -## Overview - -The Transformer-XL model was proposed in [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai, Zhilin Yang, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan -Salakhutdinov. It's a causal (uni-directional) transformer with relative positioning (sinusoïdal) embeddings which can -reuse previously computed hidden-states to attend to longer context (memory). This model also uses adaptive softmax -inputs and outputs (tied). - -The abstract from the paper is the following: - -*Transformers have a potential of learning longer-term dependency, but are limited by a fixed-length context in the -setting of language modeling. We propose a novel neural architecture Transformer-XL that enables learning dependency -beyond a fixed length without disrupting temporal coherence. It consists of a segment-level recurrence mechanism and a -novel positional encoding scheme. Our method not only enables capturing longer-term dependency, but also resolves the -context fragmentation problem. As a result, Transformer-XL learns dependency that is 80% longer than RNNs and 450% -longer than vanilla Transformers, achieves better performance on both short and long sequences, and is up to 1,800+ -times faster than vanilla Transformers during evaluation. Notably, we improve the state-of-the-art results of -bpc/perplexity to 0.99 on enwiki8, 1.08 on text8, 18.3 on WikiText-103, 21.8 on One Billion Word, and 54.5 on Penn -Treebank (without finetuning). When trained only on WikiText-103, Transformer-XL manages to generate reasonably -coherent, novel text articles with thousands of tokens.* - -Tips: - -- Transformer-XL uses relative sinusoidal positional embeddings. Padding can be done on the left or on the right. 
The - original implementation trains on SQuAD with padding on the left, therefore the padding defaults are set to left. -- Transformer-XL is one of the few models that has no sequence length limit. - -This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/kimiyoung/transformer-xl). - - - -TransformerXL does **not** work with *torch.nn.DataParallel* due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035) - - - - -## TransfoXLConfig - -[[autodoc]] TransfoXLConfig - -## TransfoXLTokenizer - -[[autodoc]] TransfoXLTokenizer - - save_vocabulary - -## TransfoXL specific outputs - -[[autodoc]] models.transfo_xl.modeling_transfo_xl.TransfoXLModelOutput - -[[autodoc]] models.transfo_xl.modeling_transfo_xl.TransfoXLLMHeadModelOutput - -[[autodoc]] models.transfo_xl.modeling_tf_transfo_xl.TFTransfoXLModelOutput - -[[autodoc]] models.transfo_xl.modeling_tf_transfo_xl.TFTransfoXLLMHeadModelOutput - -## TransfoXLModel - -[[autodoc]] TransfoXLModel - - forward - -## TransfoXLLMHeadModel - -[[autodoc]] TransfoXLLMHeadModel - - forward - -## TransfoXLForSequenceClassification - -[[autodoc]] TransfoXLForSequenceClassification - - forward - -## TFTransfoXLModel - -[[autodoc]] TFTransfoXLModel - - call - -## TFTransfoXLLMHeadModel - -[[autodoc]] TFTransfoXLLMHeadModel - - call - -## TFTransfoXLForSequenceClassification - -[[autodoc]] TFTransfoXLForSequenceClassification - - call - -## Internal Layers - -[[autodoc]] AdaptiveEmbedding - -[[autodoc]] TFAdaptiveEmbedding diff --git a/docs/source/en/model_doc/trocr.md b/docs/source/en/model_doc/trocr.md new file mode 100644 index 000000000000..bfab93ad663b --- /dev/null +++ b/docs/source/en/model_doc/trocr.md @@ -0,0 +1,126 @@ + + +# TrOCR + +## Overview + +The TrOCR model was proposed in [TrOCR: Transformer-based Optical Character Recognition with Pre-trained +Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, +Zhoujun Li, Furu Wei. TrOCR consists of an image Transformer encoder and an autoregressive text Transformer decoder to +perform [optical character recognition (OCR)](https://en.wikipedia.org/wiki/Optical_character_recognition). + +The abstract from the paper is the following: + +*Text recognition is a long-standing research problem for document digitalization. Existing approaches for text recognition +are usually built based on CNN for image understanding and RNN for char-level text generation. In addition, another language +model is usually needed to improve the overall accuracy as a post-processing step. In this paper, we propose an end-to-end +text recognition approach with pre-trained image Transformer and text Transformer models, namely TrOCR, which leverages the +Transformer architecture for both image understanding and wordpiece-level text generation. The TrOCR model is simple but +effective, and can be pre-trained with large-scale synthetic data and fine-tuned with human-labeled datasets. Experiments +show that the TrOCR model outperforms the current state-of-the-art models on both printed and handwritten text recognition +tasks.* + + + + TrOCR architecture. Taken from the original paper. + +Please refer to the [`VisionEncoderDecoder`] class on how to use this model. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found +[here](https://github.com/microsoft/unilm/tree/6f60612e7cc86a2a1ae85c47231507a587ab4e01/trocr). 
+ +Tips: + +- The quickest way to get started with TrOCR is by checking the [tutorial + notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/TrOCR), which show how to use the model + at inference time as well as fine-tuning on custom data. +- TrOCR is pre-trained in 2 stages before being fine-tuned on downstream datasets. It achieves state-of-the-art results + on both printed (e.g. the [SROIE dataset](https://paperswithcode.com/dataset/sroie) and handwritten (e.g. the [IAM + Handwriting dataset](https://fki.tic.heia-fr.ch/databases/iam-handwriting-database>) text recognition tasks. For more + information, see the [official models](https://huggingface.co/models?other=trocr>). +- TrOCR is always used within the [VisionEncoderDecoder](vision-encoder-decoder) framework. + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with TrOCR. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + + + +- A blog post on [Accelerating Document AI](https://huggingface.co/blog/document-ai) with TrOCR. +- A blog post on how to [Document AI](https://github.com/philschmid/document-ai-transformers) with TrOCR. +- A notebook on how to [finetune TrOCR on IAM Handwriting Database using Seq2SeqTrainer](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_Seq2SeqTrainer.ipynb). +- A notebook on [inference with TrOCR](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Inference_with_TrOCR_%2B_Gradio_demo.ipynb) and Gradio demo. +- A notebook on [finetune TrOCR on the IAM Handwriting Database](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_native_PyTorch.ipynb) using native PyTorch. +- A notebook on [evaluating TrOCR on the IAM test set](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Evaluating_TrOCR_base_handwritten_on_the_IAM_test_set.ipynb). + + + +- [Casual language modeling](https://huggingface.co/docs/transformers/tasks/language_modeling) task guide. + +⚡️ Inference + +- An interactive-demo on [TrOCR handwritten character recognition](https://huggingface.co/spaces/nielsr/TrOCR-handwritten). + +## Inference + +TrOCR's [`VisionEncoderDecoder`] model accepts images as input and makes use of +[`~generation.GenerationMixin.generate`] to autoregressively generate text given the input image. + +The [`ViTImageProcessor`/`DeiTImageProcessor`] class is responsible for preprocessing the input image and +[`RobertaTokenizer`/`XLMRobertaTokenizer`] decodes the generated target tokens to the target string. The +[`TrOCRProcessor`] wraps [`ViTImageProcessor`/`DeiTImageProcessor`] and [`RobertaTokenizer`/`XLMRobertaTokenizer`] +into a single instance to both extract the input features and decode the predicted token ids. 
+ +- Step-by-step Optical Character Recognition (OCR) + +``` py +>>> from transformers import TrOCRProcessor, VisionEncoderDecoderModel +>>> import requests +>>> from PIL import Image + +>>> processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten") +>>> model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten") + +>>> # load image from the IAM dataset +>>> url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg" +>>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + +>>> pixel_values = processor(image, return_tensors="pt").pixel_values +>>> generated_ids = model.generate(pixel_values) + +>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] +``` + +See the [model hub](https://huggingface.co/models?filter=trocr) to look for TrOCR checkpoints. + +## TrOCRConfig + +[[autodoc]] TrOCRConfig + +## TrOCRProcessor + +[[autodoc]] TrOCRProcessor + - __call__ + - from_pretrained + - save_pretrained + - batch_decode + - decode + +## TrOCRForCausalLM + +[[autodoc]] TrOCRForCausalLM + - forward diff --git a/docs/source/en/model_doc/trocr.mdx b/docs/source/en/model_doc/trocr.mdx deleted file mode 100644 index 3e3a6c100753..000000000000 --- a/docs/source/en/model_doc/trocr.mdx +++ /dev/null @@ -1,101 +0,0 @@ - - -# TrOCR - -## Overview - -The TrOCR model was proposed in [TrOCR: Transformer-based Optical Character Recognition with Pre-trained -Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, -Zhoujun Li, Furu Wei. TrOCR consists of an image Transformer encoder and an autoregressive text Transformer decoder to -perform [optical character recognition (OCR)](https://en.wikipedia.org/wiki/Optical_character_recognition). - -The abstract from the paper is the following: - -*Text recognition is a long-standing research problem for document digitalization. Existing approaches for text recognition -are usually built based on CNN for image understanding and RNN for char-level text generation. In addition, another language -model is usually needed to improve the overall accuracy as a post-processing step. In this paper, we propose an end-to-end -text recognition approach with pre-trained image Transformer and text Transformer models, namely TrOCR, which leverages the -Transformer architecture for both image understanding and wordpiece-level text generation. The TrOCR model is simple but -effective, and can be pre-trained with large-scale synthetic data and fine-tuned with human-labeled datasets. Experiments -show that the TrOCR model outperforms the current state-of-the-art models on both printed and handwritten text recognition -tasks.* - - - - TrOCR architecture. Taken from the original paper. - -Please refer to the [`VisionEncoderDecoder`] class on how to use this model. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found -[here](https://github.com/microsoft/unilm/tree/6f60612e7cc86a2a1ae85c47231507a587ab4e01/trocr). - -Tips: - -- The quickest way to get started with TrOCR is by checking the [tutorial - notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/TrOCR), which show how to use the model - at inference time as well as fine-tuning on custom data. -- TrOCR is pre-trained in 2 stages before being fine-tuned on downstream datasets. It achieves state-of-the-art results - on both printed (e.g. 
the [SROIE dataset](https://paperswithcode.com/dataset/sroie) and handwritten (e.g. the [IAM - Handwriting dataset](https://fki.tic.heia-fr.ch/databases/iam-handwriting-database>) text recognition tasks. For more - information, see the [official models](https://huggingface.co/models?other=trocr>). -- TrOCR is always used within the [VisionEncoderDecoder](vision-encoder-decoder) framework. - -## Inference - -TrOCR's [`VisionEncoderDecoder`] model accepts images as input and makes use of -[`~generation.GenerationMixin.generate`] to autoregressively generate text given the input image. - -The [`ViTImageProcessor`/`DeiTImageProcessor`] class is responsible for preprocessing the input image and -[`RobertaTokenizer`/`XLMRobertaTokenizer`] decodes the generated target tokens to the target string. The -[`TrOCRProcessor`] wraps [`ViTImageProcessor`/`DeiTImageProcessor`] and [`RobertaTokenizer`/`XLMRobertaTokenizer`] -into a single instance to both extract the input features and decode the predicted token ids. - -- Step-by-step Optical Character Recognition (OCR) - -``` py ->>> from transformers import TrOCRProcessor, VisionEncoderDecoderModel ->>> import requests ->>> from PIL import Image - ->>> processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten") ->>> model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten") - ->>> # load image from the IAM dataset ->>> url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg" ->>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - ->>> pixel_values = processor(image, return_tensors="pt").pixel_values ->>> generated_ids = model.generate(pixel_values) - ->>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] -``` - -See the [model hub](https://huggingface.co/models?filter=trocr) to look for TrOCR checkpoints. - -## TrOCRConfig - -[[autodoc]] TrOCRConfig - -## TrOCRProcessor - -[[autodoc]] TrOCRProcessor - - __call__ - - from_pretrained - - save_pretrained - - batch_decode - - decode - -## TrOCRForCausalLM - -[[autodoc]] TrOCRForCausalLM - - forward diff --git a/docs/source/en/model_doc/tvlt.md b/docs/source/en/model_doc/tvlt.md new file mode 100644 index 000000000000..5ddb6badb71f --- /dev/null +++ b/docs/source/en/model_doc/tvlt.md @@ -0,0 +1,77 @@ + + +# TVLT + +## Overview + +The TVLT model was proposed in [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) +by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal (the first three authors contributed equally). The Textless Vision-Language Transformer (TVLT) is a model that uses raw visual and audio inputs for vision-and-language representation learning, without using text-specific modules such as tokenization or automatic speech recognition (ASR). It can perform various audiovisual and vision-language tasks like retrieval, question answering, etc. + +The abstract from the paper is the following: + +*In this work, we present the Textless Vision-Language Transformer (TVLT), where homogeneous transformer blocks take raw visual and audio inputs for vision-and-language representation learning with minimal modality-specific design, and do not use text-specific modules such as tokenization or automatic speech recognition (ASR). TVLT is trained by reconstructing masked patches of continuous video frames and audio spectrograms (masked autoencoding) and contrastive modeling to align video and audio. 
TVLT attains performance comparable to its text-based counterpart on various multimodal tasks, such as visual question answering, image retrieval, video retrieval, and multimodal sentiment analysis, with 28x faster inference speed and only 1/3 of the parameters. Our findings suggest the possibility of learning compact and efficient visual-linguistic representations from low-level visual and audio signals without assuming the prior existence of text.* + +Tips: + +- TVLT is a model that takes both `pixel_values` and `audio_values` as input. One can use [`TvltProcessor`] to prepare data for the model. + This processor wraps an image processor (for the image/video modality) and an audio feature extractor (for the audio modality) into one. +- TVLT is trained with images/videos and audios of various sizes: the authors resize and crop the input images/videos to 224 and limit the length of audio spectrogram to 2048. To make batching of videos and audios possible, the authors use a `pixel_mask` that indicates which pixels are real/padding and `audio_mask` that indicates which audio values are real/padding. +- The design of TVLT is very similar to that of a standard Vision Transformer (ViT) and masked autoencoder (MAE) as in [ViTMAE](vitmae). The difference is that the model includes embedding layers for the audio modality. +- The PyTorch version of this model is only available in torch 1.10 and higher. + +
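A minimal sketch of the data preparation described in the first tip, assuming the `ZinengTang/tvlt-base` checkpoint and random dummy inputs in place of a real video and audio clip:

```python
import numpy as np
import torch
from transformers import TvltModel, TvltProcessor

# checkpoint name is an assumption for illustration
processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
model = TvltModel.from_pretrained("ZinengTang/tvlt-base")

# dummy inputs: 8 video frames of shape (channels, height, width) and a short mono waveform
video_frames = list(np.random.rand(8, 3, 224, 224))
audio = np.random.rand(10000)

# the processor bundles the image processor and audio feature extractor outputs into one dict
inputs = processor(images=video_frames, audio=audio, sampling_rate=44100, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

print(outputs.last_hidden_state.shape)
```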

+ +

+ + TVLT architecture. Taken from the original paper. + +The original code can be found [here](https://github.com/zinengtang/TVLT). This model was contributed by [Zineng Tang](https://huggingface.co/ZinengTang). + +## TvltConfig + +[[autodoc]] TvltConfig + +## TvltProcessor + +[[autodoc]] TvltProcessor + - __call__ + +## TvltImageProcessor + +[[autodoc]] TvltImageProcessor + - preprocess + +## TvltFeatureExtractor + +[[autodoc]] TvltFeatureExtractor + - __call__ + +## TvltModel + +[[autodoc]] TvltModel + - forward + +## TvltForPreTraining + +[[autodoc]] TvltForPreTraining + - forward + +## TvltForAudioVisualClassification + +[[autodoc]] TvltForAudioVisualClassification + - forward diff --git a/docs/source/en/model_doc/ul2.md b/docs/source/en/model_doc/ul2.md new file mode 100644 index 000000000000..3863f23a7d73 --- /dev/null +++ b/docs/source/en/model_doc/ul2.md @@ -0,0 +1,35 @@ + + +# UL2 + +## Overview + +The T5 model was presented in [Unifying Language Learning Paradigms](https://arxiv.org/pdf/2205.05131v1.pdf) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler. + +The abstract from the paper is the following: + +*Existing pre-trained models are generally geared towards a particular class of problems. To date, there seems to be still no consensus on what the right architecture and pre-training setup should be. This paper presents a unified framework for pre-training models that are universally effective across datasets and setups. We begin by disentangling architectural archetypes with pre-training objectives -- two concepts that are commonly conflated. Next, we present a generalized and unified perspective for self-supervision in NLP and show how different pre-training objectives can be cast as one another and how interpolating between different objectives can be effective. We then propose Mixture-of-Denoisers (MoD), a pre-training objective that combines diverse pre-training paradigms together. We furthermore introduce a notion of mode switching, wherein downstream fine-tuning is associated with specific pre-training schemes. We conduct extensive ablative experiments to compare multiple pre-training objectives and find that our method pushes the Pareto-frontier by outperforming T5 and/or GPT-like models across multiple diverse setups. Finally, by scaling our model up to 20B parameters, we achieve SOTA performance on 50 well-established supervised NLP tasks ranging from language generation (with automated and human evaluation), language understanding, text classification, question answering, commonsense reasoning, long text reasoning, structured knowledge grounding and information retrieval. Our model also achieve strong results at in-context learning, outperforming 175B GPT-3 on zero-shot SuperGLUE and tripling the performance of T5-XXL on one-shot summarization.* + +Tips: + +- UL2 is an encoder-decoder model pre-trained on a mixture of denoising functions as well as fine-tuned on an array of downstream tasks. +- UL2 has the same architecture as [T5v1.1](t5v1.1) but uses the Gated-SiLU activation function instead of Gated-GELU. +- The authors release checkpoints of one architecture which can be seen [here](https://huggingface.co/google/ul2) + +The original code can be found [here](https://github.com/google-research/google-research/tree/master/ul2). + +This model was contributed by [DanielHesslow](https://huggingface.co/Seledorn). 
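Because UL2 reuses the T5v1.1 architecture, the released checkpoint can be loaded with the regular T5 classes. The sketch below is illustrative only: it assumes the [google/ul2](https://huggingface.co/google/ul2) checkpoint (about 20B parameters, so a large GPU or offloading is needed in practice), and the `[S2S]` mode prefix is an assumption taken from the checkpoint's usage notes rather than from this page; check the model card for the exact paradigm tokens.

```python
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("google/ul2")
# bfloat16 + device_map="auto" (requires `accelerate`) to fit the ~20B-parameter checkpoint
model = T5ForConditionalGeneration.from_pretrained("google/ul2", torch_dtype=torch.bfloat16, device_map="auto")

# the "[S2S]" mode prefix is an assumption; see the model card for the paradigm tokens used during pre-training
prompt = "[S2S] A young wizard receives a letter inviting him to a school of magic. <extra_id_0>"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```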
diff --git a/docs/source/en/model_doc/ul2.mdx b/docs/source/en/model_doc/ul2.mdx deleted file mode 100644 index 2481285747fa..000000000000 --- a/docs/source/en/model_doc/ul2.mdx +++ /dev/null @@ -1,31 +0,0 @@ - - -# UL2 - -## Overview - -The T5 model was presented in [Unifying Language Learning Paradigms](https://arxiv.org/pdf/2205.05131v1.pdf) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler. - -The abstract from the paper is the following: - -*Existing pre-trained models are generally geared towards a particular class of problems. To date, there seems to be still no consensus on what the right architecture and pre-training setup should be. This paper presents a unified framework for pre-training models that are universally effective across datasets and setups. We begin by disentangling architectural archetypes with pre-training objectives -- two concepts that are commonly conflated. Next, we present a generalized and unified perspective for self-supervision in NLP and show how different pre-training objectives can be cast as one another and how interpolating between different objectives can be effective. We then propose Mixture-of-Denoisers (MoD), a pre-training objective that combines diverse pre-training paradigms together. We furthermore introduce a notion of mode switching, wherein downstream fine-tuning is associated with specific pre-training schemes. We conduct extensive ablative experiments to compare multiple pre-training objectives and find that our method pushes the Pareto-frontier by outperforming T5 and/or GPT-like models across multiple diverse setups. Finally, by scaling our model up to 20B parameters, we achieve SOTA performance on 50 well-established supervised NLP tasks ranging from language generation (with automated and human evaluation), language understanding, text classification, question answering, commonsense reasoning, long text reasoning, structured knowledge grounding and information retrieval. Our model also achieve strong results at in-context learning, outperforming 175B GPT-3 on zero-shot SuperGLUE and tripling the performance of T5-XXL on one-shot summarization.* - -Tips: - -- UL2 is an encoder-decoder model pre-trained on a mixture of denoising functions as well as fine-tuned on an array of downstream tasks. -- UL2 has the same architecture as [T5v1.1](t5v1.1) but uses the Gated-SiLU activation function instead of Gated-GELU. -- The authors release checkpoints of one architecture which can be seen [here](https://huggingface.co/google/ul2) - -The original code can be found [here](https://github.com/google-research/google-research/tree/master/ul2). - -This model was contributed by [DanielHesslow](https://huggingface.co/Seledorn). diff --git a/docs/source/en/model_doc/umt5.md b/docs/source/en/model_doc/umt5.md new file mode 100644 index 000000000000..4e6375bd465a --- /dev/null +++ b/docs/source/en/model_doc/umt5.md @@ -0,0 +1,104 @@ + + +# UMT5 + +
+ +## Overview + +The UMT5 model was proposed in [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant. + +The abstract from the paper is the following: + +*Pretrained multilingual large language models have typically used heuristic temperature-based sampling to balance between different languages. However previous work has not systematically evaluated the efficacy of different pretraining language distributions across model scales. In this paper, we propose a new sampling method, UniMax, that delivers more uniform coverage of head languages while mitigating overfitting on tail languages by explicitly capping the number of repeats over each language's corpus. We perform an extensive series of ablations testing a range of sampling strategies on a suite of multilingual benchmarks, while varying model scale. We find that UniMax outperforms standard temperature-based sampling, and the benefits persist as scale increases. As part of our contribution, we release: (i) an improved and refreshed mC4 multilingual corpus consisting of 29 trillion characters across 107 languages, and (ii) a suite of pretrained umT5 model checkpoints trained with UniMax sampling.* + +Tips: + +- UMT5 was only pre-trained on [mC4](https://huggingface.co/datasets/mc4) excluding any supervised training. +Therefore, this model has to be fine-tuned before it is usable on a downstream task, unlike the original T5 model. +- Since umT5 was pre-trained in an unsupervise manner, there's no real advantage to using a task prefix during single-task +fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix. + +Google has released the following variants: + +- [google/umt5-small](https://huggingface.co/google/umt5-small) +- [google/umt5-base](https://huggingface.co/google/umt5-base) +- [google/umt5-xl](https://huggingface.co/google/umt5-xl) +- [google/umt5-xxl](https://huggingface.co/google/umt5-xxl). + +This model was contributed by [agemagician](https://huggingface.co/agemagician) and [stefan-it](https://huggingface.co/stefan-it). The original code can be +found [here](https://github.com/google-research/t5x). + +One can refer to [T5's documentation page](t5) for more tips, code examples and notebooks. + +## Differences with mT5? +`UmT5` is based on mT5, with a non-shared relative positional bias that is computed for each layer. This means that the model set `has_relative_bias` for each layer. +The conversion script is also different because the model was saved in t5x's latest checkpointing format. + +# Sample usage + +```python +>>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer + +>>> model = AutoModelForSeq2SeqLM.from_pretrained("google/umt5-small") +>>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small") + +>>> inputs = tokenizer( +... "A walks into a bar and orders a with pinch of .", +... return_tensors="pt", +... ) +>>> outputs = model.generate(**inputs) +>>> print(tokenizer.batch_decode(outputs)) +['nyone who drink a alcohol A A. 
This I'] +``` + +## UMT5Config + +[[autodoc]] UMT5Config + +## UMT5Model + +[[autodoc]] UMT5Model + - forward + +## UMT5ForConditionalGeneration + +[[autodoc]] UMT5ForConditionalGeneration + - forward + +## UMT5EncoderModel + +[[autodoc]] UMT5EncoderModel + - forward + +## UMT5ForSequenceClassification + +[[autodoc]] UMT5ForSequenceClassification + - forward + +## UMT5ForQuestionAnswering + +[[autodoc]] UMT5ForQuestionAnswering + - forward + diff --git a/docs/source/en/model_doc/unispeech-sat.md b/docs/source/en/model_doc/unispeech-sat.md new file mode 100644 index 000000000000..25489d9eeffd --- /dev/null +++ b/docs/source/en/model_doc/unispeech-sat.md @@ -0,0 +1,92 @@ + + +# UniSpeech-SAT + +## Overview + +The UniSpeech-SAT model was proposed in [UniSpeech-SAT: Universal Speech Representation Learning with Speaker Aware +Pre-Training](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, +Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu . + +The abstract from the paper is the following: + +*Self-supervised learning (SSL) is a long-standing goal for speech processing, since it utilizes large-scale unlabeled +data and avoids extensive human labeling. Recent years witness great successes in applying self-supervised learning in +speech recognition, while limited exploration was attempted in applying SSL for modeling speaker characteristics. In +this paper, we aim to improve the existing SSL framework for speaker representation learning. Two methods are +introduced for enhancing the unsupervised speaker information extraction. First, we apply the multi-task learning to +the current SSL framework, where we integrate the utterance-wise contrastive loss with the SSL objective function. +Second, for better speaker discrimination, we propose an utterance mixing strategy for data augmentation, where +additional overlapped utterances are created unsupervisely and incorporate during training. We integrate the proposed +methods into the HuBERT framework. Experiment results on SUPERB benchmark show that the proposed system achieves +state-of-the-art performance in universal representation learning, especially for speaker identification oriented +tasks. An ablation study is performed verifying the efficacy of each proposed method. Finally, we scale up training +dataset to 94 thousand hours public audio data and achieve further performance improvement in all SUPERB tasks.* + +Tips: + +- UniSpeechSat is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. + Please use [`Wav2Vec2Processor`] for the feature extraction. +- UniSpeechSat model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be + decoded using [`Wav2Vec2CTCTokenizer`]. +- UniSpeechSat performs especially well on speaker verification, speaker identification, and speaker diarization tasks. + +This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be +found [here](https://github.com/microsoft/UniSpeech/tree/main/UniSpeech-SAT). 
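To make the speaker-verification tip concrete, here is a hedged sketch using [`UniSpeechSatForXVector`]; the `microsoft/unispeech-sat-base-plus-sv` checkpoint and the small demo dataset are assumptions for illustration:

```python
import torch
from datasets import load_dataset
from transformers import AutoFeatureExtractor, UniSpeechSatForXVector

# small public demo set used here only as an example source of 16 kHz speech
dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")

feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-sat-base-plus-sv")
model = UniSpeechSatForXVector.from_pretrained("microsoft/unispeech-sat-base-plus-sv")

# embed two utterances and compare them with cosine similarity
inputs = feature_extractor(
    [d["array"] for d in dataset[:2]["audio"]], sampling_rate=16000, return_tensors="pt", padding=True
)
with torch.no_grad():
    embeddings = model(**inputs).embeddings

embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
similarity = torch.nn.functional.cosine_similarity(embeddings[0], embeddings[1], dim=-1)
print(f"cosine similarity: {similarity.item():.2f}")  # higher values suggest the same speaker
```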
+ +## Documentation resources + +- [Audio classification task guide](../tasks/audio_classification) +- [Automatic speech recognition task guide](../tasks/asr) + +## UniSpeechSatConfig + +[[autodoc]] UniSpeechSatConfig + +## UniSpeechSat specific outputs + +[[autodoc]] models.unispeech_sat.modeling_unispeech_sat.UniSpeechSatForPreTrainingOutput + +## UniSpeechSatModel + +[[autodoc]] UniSpeechSatModel + - forward + +## UniSpeechSatForCTC + +[[autodoc]] UniSpeechSatForCTC + - forward + +## UniSpeechSatForSequenceClassification + +[[autodoc]] UniSpeechSatForSequenceClassification + - forward + +## UniSpeechSatForAudioFrameClassification + +[[autodoc]] UniSpeechSatForAudioFrameClassification + - forward + +## UniSpeechSatForXVector + +[[autodoc]] UniSpeechSatForXVector + - forward + +## UniSpeechSatForPreTraining + +[[autodoc]] UniSpeechSatForPreTraining + - forward diff --git a/docs/source/en/model_doc/unispeech-sat.mdx b/docs/source/en/model_doc/unispeech-sat.mdx deleted file mode 100644 index e2ceb783ea94..000000000000 --- a/docs/source/en/model_doc/unispeech-sat.mdx +++ /dev/null @@ -1,84 +0,0 @@ - - -# UniSpeech-SAT - -## Overview - -The UniSpeech-SAT model was proposed in [UniSpeech-SAT: Universal Speech Representation Learning with Speaker Aware -Pre-Training](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, -Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu . - -The abstract from the paper is the following: - -*Self-supervised learning (SSL) is a long-standing goal for speech processing, since it utilizes large-scale unlabeled -data and avoids extensive human labeling. Recent years witness great successes in applying self-supervised learning in -speech recognition, while limited exploration was attempted in applying SSL for modeling speaker characteristics. In -this paper, we aim to improve the existing SSL framework for speaker representation learning. Two methods are -introduced for enhancing the unsupervised speaker information extraction. First, we apply the multi-task learning to -the current SSL framework, where we integrate the utterance-wise contrastive loss with the SSL objective function. -Second, for better speaker discrimination, we propose an utterance mixing strategy for data augmentation, where -additional overlapped utterances are created unsupervisely and incorporate during training. We integrate the proposed -methods into the HuBERT framework. Experiment results on SUPERB benchmark show that the proposed system achieves -state-of-the-art performance in universal representation learning, especially for speaker identification oriented -tasks. An ablation study is performed verifying the efficacy of each proposed method. Finally, we scale up training -dataset to 94 thousand hours public audio data and achieve further performance improvement in all SUPERB tasks.* - -Tips: - -- UniSpeechSat is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. - Please use [`Wav2Vec2Processor`] for the feature extraction. -- UniSpeechSat model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be - decoded using [`Wav2Vec2CTCTokenizer`]. -- UniSpeechSat performs especially well on speaker verification, speaker identification, and speaker diarization tasks. - -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). 
The Authors' code can be -found [here](https://github.com/microsoft/UniSpeech/tree/main/UniSpeech-SAT). - - -## UniSpeechSatConfig - -[[autodoc]] UniSpeechSatConfig - -## UniSpeechSat specific outputs - -[[autodoc]] models.unispeech_sat.modeling_unispeech_sat.UniSpeechSatForPreTrainingOutput - -## UniSpeechSatModel - -[[autodoc]] UniSpeechSatModel - - forward - -## UniSpeechSatForCTC - -[[autodoc]] UniSpeechSatForCTC - - forward - -## UniSpeechSatForSequenceClassification - -[[autodoc]] UniSpeechSatForSequenceClassification - - forward - -## UniSpeechSatForAudioFrameClassification - -[[autodoc]] UniSpeechSatForAudioFrameClassification - - forward - -## UniSpeechSatForXVector - -[[autodoc]] UniSpeechSatForXVector - - forward - -## UniSpeechSatForPreTraining - -[[autodoc]] UniSpeechSatForPreTraining - - forward diff --git a/docs/source/en/model_doc/unispeech.md b/docs/source/en/model_doc/unispeech.md new file mode 100644 index 000000000000..8338aa1bda2e --- /dev/null +++ b/docs/source/en/model_doc/unispeech.md @@ -0,0 +1,77 @@ + + +# UniSpeech + +## Overview + +The UniSpeech model was proposed in [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael +Zeng, Xuedong Huang . + +The abstract from the paper is the following: + +*In this paper, we propose a unified pre-training approach called UniSpeech to learn speech representations with both +unlabeled and labeled data, in which supervised phonetic CTC learning and phonetically-aware contrastive +self-supervised learning are conducted in a multi-task learning manner. The resultant representations can capture +information more correlated with phonetic structures and improve the generalization across languages and domains. We +evaluate the effectiveness of UniSpeech for cross-lingual representation learning on public CommonVoice corpus. The +results show that UniSpeech outperforms self-supervised pretraining and supervised transfer learning for speech +recognition by a maximum of 13.4% and 17.8% relative phone error rate reductions respectively (averaged over all +testing languages). The transferability of UniSpeech is also demonstrated on a domain-shift speech recognition task, +i.e., a relative word error rate reduction of 6% against the previous approach.* + +Tips: + +- UniSpeech is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. Please + use [`Wav2Vec2Processor`] for the feature extraction. +- UniSpeech model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be + decoded using [`Wav2Vec2CTCTokenizer`]. + +This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be +found [here](https://github.com/microsoft/UniSpeech/tree/main/UniSpeech). 
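+
+As a rough sketch of the CTC workflow described in the tips (the checkpoint name below is a placeholder for whichever CTC fine-tuned UniSpeech model you use, and random noise stands in for real 16 kHz audio):
+
+```python
+import torch
+from transformers import Wav2Vec2Processor, UniSpeechForCTC
+
+# placeholder -- point this at a CTC fine-tuned UniSpeech checkpoint
+checkpoint = "path/to/unispeech-ctc-checkpoint"
+processor = Wav2Vec2Processor.from_pretrained(checkpoint)
+model = UniSpeechForCTC.from_pretrained(checkpoint)
+
+# raw 16 kHz waveform (random noise here, only to show the expected input format)
+waveform = torch.randn(16000).numpy()
+inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")
+
+with torch.no_grad():
+    logits = model(**inputs).logits
+
+# greedy CTC decoding back to text via the wrapped Wav2Vec2CTCTokenizer
+predicted_ids = torch.argmax(logits, dim=-1)
+transcription = processor.batch_decode(predicted_ids)[0]
+print(transcription)
+```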
+ +## Documentation resources + +- [Audio classification task guide](../tasks/audio_classification) +- [Automatic speech recognition task guide](../tasks/asr) + +## UniSpeechConfig + +[[autodoc]] UniSpeechConfig + +## UniSpeech specific outputs + +[[autodoc]] models.unispeech.modeling_unispeech.UniSpeechForPreTrainingOutput + +## UniSpeechModel + +[[autodoc]] UniSpeechModel + - forward + +## UniSpeechForCTC + +[[autodoc]] UniSpeechForCTC + - forward + +## UniSpeechForSequenceClassification + +[[autodoc]] UniSpeechForSequenceClassification + - forward + +## UniSpeechForPreTraining + +[[autodoc]] UniSpeechForPreTraining + - forward diff --git a/docs/source/en/model_doc/unispeech.mdx b/docs/source/en/model_doc/unispeech.mdx deleted file mode 100644 index 37d0a0a708e9..000000000000 --- a/docs/source/en/model_doc/unispeech.mdx +++ /dev/null @@ -1,69 +0,0 @@ - - -# UniSpeech - -## Overview - -The UniSpeech model was proposed in [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael -Zeng, Xuedong Huang . - -The abstract from the paper is the following: - -*In this paper, we propose a unified pre-training approach called UniSpeech to learn speech representations with both -unlabeled and labeled data, in which supervised phonetic CTC learning and phonetically-aware contrastive -self-supervised learning are conducted in a multi-task learning manner. The resultant representations can capture -information more correlated with phonetic structures and improve the generalization across languages and domains. We -evaluate the effectiveness of UniSpeech for cross-lingual representation learning on public CommonVoice corpus. The -results show that UniSpeech outperforms self-supervised pretraining and supervised transfer learning for speech -recognition by a maximum of 13.4% and 17.8% relative phone error rate reductions respectively (averaged over all -testing languages). The transferability of UniSpeech is also demonstrated on a domain-shift speech recognition task, -i.e., a relative word error rate reduction of 6% against the previous approach.* - -Tips: - -- UniSpeech is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. Please - use [`Wav2Vec2Processor`] for the feature extraction. -- UniSpeech model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be - decoded using [`Wav2Vec2CTCTokenizer`]. - -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be -found [here](https://github.com/microsoft/UniSpeech/tree/main/UniSpeech). 
- - -## UniSpeechConfig - -[[autodoc]] UniSpeechConfig - -## UniSpeech specific outputs - -[[autodoc]] models.unispeech.modeling_unispeech.UniSpeechForPreTrainingOutput - -## UniSpeechModel - -[[autodoc]] UniSpeechModel - - forward - -## UniSpeechForCTC - -[[autodoc]] UniSpeechForCTC - - forward - -## UniSpeechForSequenceClassification - -[[autodoc]] UniSpeechForSequenceClassification - - forward - -## UniSpeechForPreTraining - -[[autodoc]] UniSpeechForPreTraining - - forward diff --git a/docs/source/en/model_doc/upernet.md b/docs/source/en/model_doc/upernet.md new file mode 100644 index 000000000000..db651acaa406 --- /dev/null +++ b/docs/source/en/model_doc/upernet.md @@ -0,0 +1,79 @@ + + +# UPerNet + +## Overview + +The UPerNet model was proposed in [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) +by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun. UPerNet is a general framework to effectively segment +a wide range of concepts from images, leveraging any vision backbone like [ConvNeXt](convnext) or [Swin](swin). + +The abstract from the paper is the following: + +*Humans recognize the visual world at multiple levels: we effortlessly categorize scenes and detect objects inside, while also identifying the textures and surfaces of the objects along with their different compositional parts. In this paper, we study a new task called Unified Perceptual Parsing, which requires the machine vision systems to recognize as many visual concepts as possible from a given image. A multi-task framework called UPerNet and a training strategy are developed to learn from heterogeneous image annotations. We benchmark our framework on Unified Perceptual Parsing and show that it is able to effectively segment a wide range of concepts from images. The trained networks are further applied to discover visual knowledge in natural scenes.* + + + + UPerNet framework. Taken from the original paper. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code is based on OpenMMLab's mmsegmentation [here](https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/uper_head.py). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with UPerNet. + +- Demo notebooks for UPerNet can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/UPerNet). +- [`UperNetForSemanticSegmentation`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/semantic-segmentation) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb). +- See also: [Semantic segmentation task guide](../tasks/semantic_segmentation) + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## Usage + +UPerNet is a general framework for semantic segmentation. 
It can be used with any vision backbone, like so: + +```py +from transformers import SwinConfig, UperNetConfig, UperNetForSemanticSegmentation + +backbone_config = SwinConfig(out_features=["stage1", "stage2", "stage3", "stage4"]) + +config = UperNetConfig(backbone_config=backbone_config) +model = UperNetForSemanticSegmentation(config) +``` + +To use another vision backbone, like [ConvNeXt](convnext), simply instantiate the model with the appropriate backbone: + +```py +from transformers import ConvNextConfig, UperNetConfig, UperNetForSemanticSegmentation + +backbone_config = ConvNextConfig(out_features=["stage1", "stage2", "stage3", "stage4"]) + +config = UperNetConfig(backbone_config=backbone_config) +model = UperNetForSemanticSegmentation(config) +``` + +Note that this will randomly initialize all the weights of the model. + +## UperNetConfig + +[[autodoc]] UperNetConfig + +## UperNetForSemanticSegmentation + +[[autodoc]] UperNetForSemanticSegmentation + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/van.md b/docs/source/en/model_doc/van.md new file mode 100644 index 000000000000..b9539602d3b8 --- /dev/null +++ b/docs/source/en/model_doc/van.md @@ -0,0 +1,74 @@ + + +# VAN + + + +This model is in maintenance mode only, so we won't accept any new PRs changing its code. + +If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0. +You can do so by running the following command: `pip install -U transformers==4.30.0`. + + + +## Overview + +The VAN model was proposed in [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. + +This paper introduces a new attention layer based on convolution operations able to capture both local and distant relationships. This is done by combining normal and large kernel convolution layers. The latter uses a dilated convolution to capture distant correlations. + +The abstract from the paper is the following: + +*While originally designed for natural language processing tasks, the self-attention mechanism has recently taken various computer vision areas by storm. However, the 2D nature of images brings three challenges for applying self-attention in computer vision. (1) Treating images as 1D sequences neglects their 2D structures. (2) The quadratic complexity is too expensive for high-resolution images. (3) It only captures spatial adaptability but ignores channel adaptability. In this paper, we propose a novel large kernel attention (LKA) module to enable self-adaptive and long-range correlations in self-attention while avoiding the above issues. We further introduce a novel neural network based on LKA, namely Visual Attention Network (VAN). While extremely simple, VAN outperforms the state-of-the-art vision transformers and convolutional neural networks with a large margin in extensive experiments, including image classification, object detection, semantic segmentation, instance segmentation, etc. Code is available at [this https URL](https://github.com/Visual-Attention-Network/VAN-Classification).* + +Tips: + +- VAN does not have an embedding layer, thus the `hidden_states` will have a length equal to the number of stages. + +The figure below illustrates the architecture of a Visual Aattention Layer. Taken from the [original paper](https://arxiv.org/abs/2202.09741). + + + +This model was contributed by [Francesco](https://huggingface.co/Francesco). 
The original code can be found [here](https://github.com/Visual-Attention-Network/VAN-Classification). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with VAN. + + + +- [`VanForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). +- See also: [Image classification task guide](../tasks/image_classification) + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## VanConfig + +[[autodoc]] VanConfig + + +## VanModel + +[[autodoc]] VanModel + - forward + + +## VanForImageClassification + +[[autodoc]] VanForImageClassification + - forward + diff --git a/docs/source/en/model_doc/van.mdx b/docs/source/en/model_doc/van.mdx deleted file mode 100644 index 9fc05ab3e752..000000000000 --- a/docs/source/en/model_doc/van.mdx +++ /dev/null @@ -1,51 +0,0 @@ - - -# VAN - -## Overview - -The VAN model was proposed in [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. - -This paper introduces a new attention layer based on convolution operations able to capture both local and distant relationships. This is done by combining normal and large kernel convolution layers. The latter uses a dilated convolution to capture distant correlations. - -The abstract from the paper is the following: - -*While originally designed for natural language processing tasks, the self-attention mechanism has recently taken various computer vision areas by storm. However, the 2D nature of images brings three challenges for applying self-attention in computer vision. (1) Treating images as 1D sequences neglects their 2D structures. (2) The quadratic complexity is too expensive for high-resolution images. (3) It only captures spatial adaptability but ignores channel adaptability. In this paper, we propose a novel large kernel attention (LKA) module to enable self-adaptive and long-range correlations in self-attention while avoiding the above issues. We further introduce a novel neural network based on LKA, namely Visual Attention Network (VAN). While extremely simple, VAN outperforms the state-of-the-art vision transformers and convolutional neural networks with a large margin in extensive experiments, including image classification, object detection, semantic segmentation, instance segmentation, etc. Code is available at [this https URL](https://github.com/Visual-Attention-Network/VAN-Classification).* - -Tips: - -- VAN does not have an embedding layer, thus the `hidden_states` will have a length equal to the number of stages. - -The figure below illustrates the architecture of a Visual Aattention Layer. Taken from the [original paper](https://arxiv.org/abs/2202.09741). - - - -This model was contributed by [Francesco](https://huggingface.co/Francesco). The original code can be found [here](https://github.com/Visual-Attention-Network/VAN-Classification). 
- - -## VanConfig - -[[autodoc]] VanConfig - - -## VanModel - -[[autodoc]] VanModel - - forward - - -## VanForImageClassification - -[[autodoc]] VanForImageClassification - - forward - diff --git a/docs/source/en/model_doc/videomae.md b/docs/source/en/model_doc/videomae.md new file mode 100644 index 000000000000..5a3620040ad9 --- /dev/null +++ b/docs/source/en/model_doc/videomae.md @@ -0,0 +1,81 @@ + + +# VideoMAE + +## Overview + +The VideoMAE model was proposed in [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. +VideoMAE extends masked auto encoders ([MAE](vit_mae)) to video, claiming state-of-the-art performance on several video classification benchmarks. + +The abstract from the paper is the following: + +*Pre-training video transformers on extra large-scale datasets is generally required to achieve premier performance on relatively small datasets. In this paper, we show that video masked autoencoders (VideoMAE) are data-efficient learners for self-supervised video pre-training (SSVP). We are inspired by the recent ImageMAE and propose customized video tube masking and reconstruction. These simple designs turn out to be effective for overcoming information leakage caused by the temporal correlation during video reconstruction. We obtain three important findings on SSVP: (1) An extremely high proportion of masking ratio (i.e., 90% to 95%) still yields favorable performance of VideoMAE. The temporally redundant video content enables higher masking ratio than that of images. (2) VideoMAE achieves impressive results on very small datasets (i.e., around 3k-4k videos) without using any extra data. This is partially ascribed to the challenging task of video reconstruction to enforce high-level structure learning. (3) VideoMAE shows that data quality is more important than data quantity for SSVP. Domain shift between pre-training and target datasets are important issues in SSVP. Notably, our VideoMAE with the vanilla ViT backbone can achieve 83.9% on Kinects-400, 75.3% on Something-Something V2, 90.8% on UCF101, and 61.1% on HMDB51 without using any extra data.* + +Tips: + +- One can use [`VideoMAEImageProcessor`] to prepare videos for the model. It will resize + normalize all frames of a video for you. +- [`VideoMAEForPreTraining`] includes the decoder on top for self-supervised pre-training. + + + + VideoMAE pre-training. Taken from the original paper. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). +The original code can be found [here](https://github.com/MCG-NJU/VideoMAE). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with VideoMAE. If +you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll +review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +**Video classification** +- [A notebook](https://github.com/huggingface/notebooks/blob/main/examples/video_classification.ipynb) that shows how +to fine-tune a VideoMAE model on a custom dataset. +- [Video classification task guide](../tasks/video_classification) +- [A 🤗 Space](https://huggingface.co/spaces/sayakpaul/video-classification-ucf101-subset) showing how to perform inference with a video classification model. 
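+
+As a minimal sketch of video classification inference, the snippet below runs the Kinetics-400 fine-tuned checkpoint `MCG-NJU/videomae-base-finetuned-kinetics` on a random 16-frame clip (a stand-in for a real video):
+
+```python
+import numpy as np
+import torch
+from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
+
+checkpoint = "MCG-NJU/videomae-base-finetuned-kinetics"
+image_processor = VideoMAEImageProcessor.from_pretrained(checkpoint)
+model = VideoMAEForVideoClassification.from_pretrained(checkpoint)
+
+# a video is a list of frames; here 16 random frames in (channels, height, width) format
+video = list(np.random.randint(0, 256, (16, 3, 224, 224)))
+
+inputs = image_processor(video, return_tensors="pt")
+with torch.no_grad():
+    logits = model(**inputs).logits
+
+predicted_label = logits.argmax(-1).item()
+print(model.config.id2label[predicted_label])
+```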
+ + +## VideoMAEConfig + +[[autodoc]] VideoMAEConfig + +## VideoMAEFeatureExtractor + +[[autodoc]] VideoMAEFeatureExtractor + - __call__ + +## VideoMAEImageProcessor + +[[autodoc]] VideoMAEImageProcessor + - preprocess + +## VideoMAEModel + +[[autodoc]] VideoMAEModel + - forward + +## VideoMAEForPreTraining + +[[autodoc]] transformers.VideoMAEForPreTraining + - forward + +## VideoMAEForVideoClassification + +[[autodoc]] transformers.VideoMAEForVideoClassification + - forward diff --git a/docs/source/en/model_doc/videomae.mdx b/docs/source/en/model_doc/videomae.mdx deleted file mode 100644 index 76e822ef8a5c..000000000000 --- a/docs/source/en/model_doc/videomae.mdx +++ /dev/null @@ -1,77 +0,0 @@ - - -# VideoMAE - -## Overview - -The VideoMAE model was proposed in [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. -VideoMAE extends masked auto encoders ([MAE](vit_mae)) to video, claiming state-of-the-art performance on several video classification benchmarks. - -The abstract from the paper is the following: - -*Pre-training video transformers on extra large-scale datasets is generally required to achieve premier performance on relatively small datasets. In this paper, we show that video masked autoencoders (VideoMAE) are data-efficient learners for self-supervised video pre-training (SSVP). We are inspired by the recent ImageMAE and propose customized video tube masking and reconstruction. These simple designs turn out to be effective for overcoming information leakage caused by the temporal correlation during video reconstruction. We obtain three important findings on SSVP: (1) An extremely high proportion of masking ratio (i.e., 90% to 95%) still yields favorable performance of VideoMAE. The temporally redundant video content enables higher masking ratio than that of images. (2) VideoMAE achieves impressive results on very small datasets (i.e., around 3k-4k videos) without using any extra data. This is partially ascribed to the challenging task of video reconstruction to enforce high-level structure learning. (3) VideoMAE shows that data quality is more important than data quantity for SSVP. Domain shift between pre-training and target datasets are important issues in SSVP. Notably, our VideoMAE with the vanilla ViT backbone can achieve 83.9% on Kinects-400, 75.3% on Something-Something V2, 90.8% on UCF101, and 61.1% on HMDB51 without using any extra data.* - -Tips: - -- One can use [`VideoMAEImageProcessor`] to prepare videos for the model. It will resize + normalize all frames of a video for you. -- [`VideoMAEForPreTraining`] includes the decoder on top for self-supervised pre-training. - - - - VideoMAE pre-training. Taken from the original paper. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). -The original code can be found [here](https://github.com/MCG-NJU/VideoMAE). - -## Resources - -A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with VideoMAE. If -you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll -review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. - -**Video classification** -- [A notebook](https://github.com/huggingface/notebooks/blob/main/examples/video_classification.ipynb) that shows how -to fine-tune a VideoMAE model on a custom dataset. 
-- [Video classification task page](https://huggingface.co/tasks/video-classification) -- [A 🤗 Space](https://huggingface.co/spaces/sayakpaul/video-classification-ucf101-subset) showing how to perform inference with a video classification model. - - -## VideoMAEConfig - -[[autodoc]] VideoMAEConfig - -## VideoMAEFeatureExtractor - -[[autodoc]] VideoMAEFeatureExtractor - - __call__ - -## VideoMAEImageProcessor - -[[autodoc]] VideoMAEImageProcessor - - preprocess - -## VideoMAEModel - -[[autodoc]] VideoMAEModel - - forward - -## VideoMAEForPreTraining - -[[autodoc]] transformers.VideoMAEForPreTraining - - forward - -## VideoMAEForVideoClassification - -[[autodoc]] transformers.VideoMAEForVideoClassification - - forward diff --git a/docs/source/en/model_doc/vilt.md b/docs/source/en/model_doc/vilt.md new file mode 100644 index 000000000000..2e2f4a140d20 --- /dev/null +++ b/docs/source/en/model_doc/vilt.md @@ -0,0 +1,108 @@ + + +# ViLT + +## Overview + +The ViLT model was proposed in [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) +by Wonjae Kim, Bokyung Son, Ildoo Kim. ViLT incorporates text embeddings into a Vision Transformer (ViT), allowing it to have a minimal design +for Vision-and-Language Pre-training (VLP). + +The abstract from the paper is the following: + +*Vision-and-Language Pre-training (VLP) has improved performance on various joint vision-and-language downstream tasks. +Current approaches to VLP heavily rely on image feature extraction processes, most of which involve region supervision +(e.g., object detection) and the convolutional architecture (e.g., ResNet). Although disregarded in the literature, we +find it problematic in terms of both (1) efficiency/speed, that simply extracting input features requires much more +computation than the multimodal interaction steps; and (2) expressive power, as it is upper bounded to the expressive +power of the visual embedder and its predefined visual vocabulary. In this paper, we present a minimal VLP model, +Vision-and-Language Transformer (ViLT), monolithic in the sense that the processing of visual inputs is drastically +simplified to just the same convolution-free manner that we process textual inputs. We show that ViLT is up to tens of +times faster than previous VLP models, yet with competitive or better downstream task performance.* + +Tips: + +- The quickest way to get started with ViLT is by checking the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/ViLT) + (which showcase both inference and fine-tuning on custom data). +- ViLT is a model that takes both `pixel_values` and `input_ids` as input. One can use [`ViltProcessor`] to prepare data for the model. + This processor wraps a image processor (for the image modality) and a tokenizer (for the language modality) into one. +- ViLT is trained with images of various sizes: the authors resize the shorter edge of input images to 384 and limit the longer edge to + under 640 while preserving the aspect ratio. To make batching of images possible, the authors use a `pixel_mask` that indicates + which pixel values are real and which are padding. [`ViltProcessor`] automatically creates this for you. +- The design of ViLT is very similar to that of a standard Vision Transformer (ViT). The only difference is that the model includes + additional embedding layers for the language modality. + + + + ViLT architecture. Taken from the original paper. 
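+
+As a quick sketch of how the processor and model fit together, the snippet below runs visual question answering with the VQAv2 fine-tuned checkpoint `dandelin/vilt-b32-finetuned-vqa`:
+
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import ViltProcessor, ViltForQuestionAnswering
+
+checkpoint = "dandelin/vilt-b32-finetuned-vqa"
+processor = ViltProcessor.from_pretrained(checkpoint)
+model = ViltForQuestionAnswering.from_pretrained(checkpoint)
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+question = "How many cats are there?"
+
+# the processor builds input_ids, pixel_values and pixel_mask in a single call
+inputs = processor(image, question, return_tensors="pt")
+with torch.no_grad():
+    logits = model(**inputs).logits
+
+predicted_answer = model.config.id2label[logits.argmax(-1).item()]
+print(predicted_answer)
+```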
+ +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/dandelin/ViLT). + + +Tips: + +- The PyTorch version of this model is only available in torch 1.10 and higher. + +## ViltConfig + +[[autodoc]] ViltConfig + +## ViltFeatureExtractor + +[[autodoc]] ViltFeatureExtractor + - __call__ + +## ViltImageProcessor + +[[autodoc]] ViltImageProcessor + - preprocess + +## ViltProcessor + +[[autodoc]] ViltProcessor + - __call__ + +## ViltModel + +[[autodoc]] ViltModel + - forward + +## ViltForMaskedLM + +[[autodoc]] ViltForMaskedLM + - forward + +## ViltForQuestionAnswering + +[[autodoc]] ViltForQuestionAnswering + - forward + +## ViltForImagesAndTextClassification + +[[autodoc]] ViltForImagesAndTextClassification + - forward + +## ViltForImageAndTextRetrieval + +[[autodoc]] ViltForImageAndTextRetrieval + - forward + +## ViltForTokenClassification + +[[autodoc]] ViltForTokenClassification + - forward diff --git a/docs/source/en/model_doc/vilt.mdx b/docs/source/en/model_doc/vilt.mdx deleted file mode 100644 index 7c8653e1a3b9..000000000000 --- a/docs/source/en/model_doc/vilt.mdx +++ /dev/null @@ -1,104 +0,0 @@ - - -# ViLT - -## Overview - -The ViLT model was proposed in [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) -by Wonjae Kim, Bokyung Son, Ildoo Kim. ViLT incorporates text embeddings into a Vision Transformer (ViT), allowing it to have a minimal design -for Vision-and-Language Pre-training (VLP). - -The abstract from the paper is the following: - -*Vision-and-Language Pre-training (VLP) has improved performance on various joint vision-and-language downstream tasks. -Current approaches to VLP heavily rely on image feature extraction processes, most of which involve region supervision -(e.g., object detection) and the convolutional architecture (e.g., ResNet). Although disregarded in the literature, we -find it problematic in terms of both (1) efficiency/speed, that simply extracting input features requires much more -computation than the multimodal interaction steps; and (2) expressive power, as it is upper bounded to the expressive -power of the visual embedder and its predefined visual vocabulary. In this paper, we present a minimal VLP model, -Vision-and-Language Transformer (ViLT), monolithic in the sense that the processing of visual inputs is drastically -simplified to just the same convolution-free manner that we process textual inputs. We show that ViLT is up to tens of -times faster than previous VLP models, yet with competitive or better downstream task performance.* - -Tips: - -- The quickest way to get started with ViLT is by checking the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/ViLT) - (which showcase both inference and fine-tuning on custom data). -- ViLT is a model that takes both `pixel_values` and `input_ids` as input. One can use [`ViltProcessor`] to prepare data for the model. - This processor wraps a feature extractor (for the image modality) and a tokenizer (for the language modality) into one. -- ViLT is trained with images of various sizes: the authors resize the shorter edge of input images to 384 and limit the longer edge to - under 640 while preserving the aspect ratio. To make batching of images possible, the authors use a `pixel_mask` that indicates - which pixel values are real and which are padding. [`ViltProcessor`] automatically creates this for you. 
-- The design of ViLT is very similar to that of a standard Vision Transformer (ViT). The only difference is that the model includes - additional embedding layers for the language modality. - - - - ViLT architecture. Taken from the original paper. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/dandelin/ViLT). - - -Tips: - -- The PyTorch version of this model is only available in torch 1.10 and higher. - -## ViltConfig - -[[autodoc]] ViltConfig - -## ViltFeatureExtractor - -[[autodoc]] ViltFeatureExtractor - - __call__ - -## ViltImageProcessor - -[[autodoc]] ViltImageProcessor - - preprocess - -## ViltProcessor - -[[autodoc]] ViltProcessor - - __call__ - -## ViltModel - -[[autodoc]] ViltModel - - forward - -## ViltForMaskedLM - -[[autodoc]] ViltForMaskedLM - - forward - -## ViltForQuestionAnswering - -[[autodoc]] ViltForQuestionAnswering - - forward - -## ViltForImagesAndTextClassification - -[[autodoc]] ViltForImagesAndTextClassification - - forward - -## ViltForImageAndTextRetrieval - -[[autodoc]] ViltForImageAndTextRetrieval - - forward - -## ViltForTokenClassification - -[[autodoc]] ViltForTokenClassification - - forward diff --git a/docs/source/en/model_doc/vision-encoder-decoder.md b/docs/source/en/model_doc/vision-encoder-decoder.md new file mode 100644 index 000000000000..0beeaeae108b --- /dev/null +++ b/docs/source/en/model_doc/vision-encoder-decoder.md @@ -0,0 +1,170 @@ + + +# Vision Encoder Decoder Models + +## Overview + +The [`VisionEncoderDecoderModel`] can be used to initialize an image-to-text model with any +pretrained Transformer-based vision model as the encoder (*e.g.* [ViT](vit), [BEiT](beit), [DeiT](deit), [Swin](swin)) +and any pretrained language model as the decoder (*e.g.* [RoBERTa](roberta), [GPT2](gpt2), [BERT](bert), [DistilBERT](distilbert)). + +The effectiveness of initializing image-to-text-sequence models with pretrained checkpoints has been shown in (for +example) [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, +Zhoujun Li, Furu Wei. + +After such a [`VisionEncoderDecoderModel`] has been trained/fine-tuned, it can be saved/loaded just like any other models (see the examples below +for more information). + +An example application is image captioning, in which the encoder is used to encode the image, after which an autoregressive language model generates +the caption. Another example is optical character recognition. Refer to [TrOCR](trocr), which is an instance of [`VisionEncoderDecoderModel`]. + +## Randomly initializing `VisionEncoderDecoderModel` from model configurations. + +[`VisionEncoderDecoderModel`] can be randomly initialized from an encoder and a decoder config. In the following example, we show how to do this using the default [`ViTModel`] configuration for the encoder +and the default [`BertForCausalLM`] configuration for the decoder. + +```python +>>> from transformers import BertConfig, ViTConfig, VisionEncoderDecoderConfig, VisionEncoderDecoderModel + +>>> config_encoder = ViTConfig() +>>> config_decoder = BertConfig() + +>>> config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder) +>>> model = VisionEncoderDecoderModel(config=config) +``` + +## Initialising `VisionEncoderDecoderModel` from a pretrained encoder and a pretrained decoder. 
+ +[`VisionEncoderDecoderModel`] can be initialized from a pretrained encoder checkpoint and a pretrained decoder checkpoint. Note that any pretrained Transformer-based vision model, *e.g.* [Swin](swin), can serve as the encoder and both pretrained auto-encoding models, *e.g.* BERT, pretrained causal language models, *e.g.* GPT2, as well as the pretrained decoder part of sequence-to-sequence models, *e.g.* decoder of BART, can be used as the decoder. +Depending on which architecture you choose as the decoder, the cross-attention layers might be randomly initialized. +Initializing [`VisionEncoderDecoderModel`] from a pretrained encoder and decoder checkpoint requires the model to be fine-tuned on a downstream task, as has been shown in [the *Warm-starting-encoder-decoder blog post*](https://huggingface.co/blog/warm-starting-encoder-decoder). +To do so, the `VisionEncoderDecoderModel` class provides a [`VisionEncoderDecoderModel.from_encoder_decoder_pretrained`] method. + +```python +>>> from transformers import VisionEncoderDecoderModel + +>>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained( +... "microsoft/swin-base-patch4-window7-224-in22k", "bert-base-uncased" +... ) +``` + +## Loading an existing `VisionEncoderDecoderModel` checkpoint and perform inference. + +To load fine-tuned checkpoints of the `VisionEncoderDecoderModel` class, [`VisionEncoderDecoderModel`] provides the `from_pretrained(...)` method just like any other model architecture in Transformers. + +To perform inference, one uses the [`generate`] method, which allows to autoregressively generate text. This method supports various forms of decoding, such as greedy, beam search and multinomial sampling. + +```python +>>> import requests +>>> from PIL import Image + +>>> from transformers import GPT2TokenizerFast, ViTImageProcessor, VisionEncoderDecoderModel + +>>> # load a fine-tuned image captioning model and corresponding tokenizer and image processor +>>> model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning") +>>> tokenizer = GPT2TokenizerFast.from_pretrained("nlpconnect/vit-gpt2-image-captioning") +>>> image_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning") + +>>> # let's perform inference on an image +>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" +>>> image = Image.open(requests.get(url, stream=True).raw) +>>> pixel_values = image_processor(image, return_tensors="pt").pixel_values + +>>> # autoregressively generate caption (uses greedy decoding by default) +>>> generated_ids = model.generate(pixel_values) +>>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +>>> print(generated_text) +a cat laying on a blanket next to a cat laying on a bed +``` + +## Loading a PyTorch checkpoint into `TFVisionEncoderDecoderModel`. + +[`TFVisionEncoderDecoderModel.from_pretrained`] currently doesn't support initializing the model from a +PyTorch checkpoint. Passing `from_pt=True` to this method will throw an exception. If there are only PyTorch +checkpoints for a particular vision encoder-decoder model, a workaround is: + +```python +>>> from transformers import VisionEncoderDecoderModel, TFVisionEncoderDecoderModel + +>>> _model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning") + +>>> _model.encoder.save_pretrained("./encoder") +>>> _model.decoder.save_pretrained("./decoder") + +>>> model = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained( +... 
"./encoder", "./decoder", encoder_from_pt=True, decoder_from_pt=True +... ) +>>> # This is only for copying some specific attributes of this particular model. +>>> model.config = _model.config +``` + +## Training + +Once the model is created, it can be fine-tuned similar to BART, T5 or any other encoder-decoder model on a dataset of (image, text) pairs. +As you can see, only 2 inputs are required for the model in order to compute a loss: `pixel_values` (which are the +images) and `labels` (which are the `input_ids` of the encoded target sequence). + +```python +>>> from transformers import ViTImageProcessor, BertTokenizer, VisionEncoderDecoderModel +>>> from datasets import load_dataset + +>>> image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") +>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") +>>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained( +... "google/vit-base-patch16-224-in21k", "bert-base-uncased" +... ) + +>>> model.config.decoder_start_token_id = tokenizer.cls_token_id +>>> model.config.pad_token_id = tokenizer.pad_token_id + +>>> dataset = load_dataset("huggingface/cats-image") +>>> image = dataset["test"]["image"][0] +>>> pixel_values = image_processor(image, return_tensors="pt").pixel_values + +>>> labels = tokenizer( +... "an image of two cats chilling on a couch", +... return_tensors="pt", +... ).input_ids + +>>> # the forward function automatically creates the correct decoder_input_ids +>>> loss = model(pixel_values=pixel_values, labels=labels).loss +``` + +This model was contributed by [nielsr](https://github.com/nielsrogge). This model's TensorFlow and Flax versions +were contributed by [ydshieh](https://github.com/ydshieh). + +## VisionEncoderDecoderConfig + +[[autodoc]] VisionEncoderDecoderConfig + +## VisionEncoderDecoderModel + +[[autodoc]] VisionEncoderDecoderModel + - forward + - from_encoder_decoder_pretrained + +## TFVisionEncoderDecoderModel + +[[autodoc]] TFVisionEncoderDecoderModel + - call + - from_encoder_decoder_pretrained + +## FlaxVisionEncoderDecoderModel + +[[autodoc]] FlaxVisionEncoderDecoderModel + - __call__ + - from_encoder_decoder_pretrained diff --git a/docs/source/en/model_doc/vision-encoder-decoder.mdx b/docs/source/en/model_doc/vision-encoder-decoder.mdx deleted file mode 100644 index 0241224c0667..000000000000 --- a/docs/source/en/model_doc/vision-encoder-decoder.mdx +++ /dev/null @@ -1,166 +0,0 @@ - - -# Vision Encoder Decoder Models - -## Overview - -The [`VisionEncoderDecoderModel`] can be used to initialize an image-to-text model with any -pretrained Transformer-based vision model as the encoder (*e.g.* [ViT](vit), [BEiT](beit), [DeiT](deit), [Swin](swin)) -and any pretrained language model as the decoder (*e.g.* [RoBERTa](roberta), [GPT2](gpt2), [BERT](bert), [DistilBERT](distilbert)). - -The effectiveness of initializing image-to-text-sequence models with pretrained checkpoints has been shown in (for -example) [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, -Zhoujun Li, Furu Wei. - -After such a [`VisionEncoderDecoderModel`] has been trained/fine-tuned, it can be saved/loaded just like any other models (see the examples below -for more information). - -An example application is image captioning, in which the encoder is used to encode the image, after which an autoregressive language model generates -the caption. 
Another example is optical character recognition. Refer to [TrOCR](trocr), which is an instance of [`VisionEncoderDecoderModel`]. - -## Randomly initializing `VisionEncoderDecoderModel` from model configurations. - -[`VisionEncoderDecoderModel`] can be randomly initialized from an encoder and a decoder config. In the following example, we show how to do this using the default [`ViTModel`] configuration for the encoder -and the default [`BertForCausalLM`] configuration for the decoder. - -```python ->>> from transformers import BertConfig, ViTConfig, VisionEncoderDecoderConfig, VisionEncoderDecoderModel - ->>> config_encoder = ViTConfig() ->>> config_decoder = BertConfig() - ->>> config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder) ->>> model = VisionEncoderDecoderModel(config=config) -``` - -## Initialising `VisionEncoderDecoderModel` from a pretrained encoder and a pretrained decoder. - -[`VisionEncoderDecoderModel`] can be initialized from a pretrained encoder checkpoint and a pretrained decoder checkpoint. Note that any pretrained Transformer-based vision model, *e.g.* [Swin](swin), can serve as the encoder and both pretrained auto-encoding models, *e.g.* BERT, pretrained causal language models, *e.g.* GPT2, as well as the pretrained decoder part of sequence-to-sequence models, *e.g.* decoder of BART, can be used as the decoder. -Depending on which architecture you choose as the decoder, the cross-attention layers might be randomly initialized. -Initializing [`VisionEncoderDecoderModel`] from a pretrained encoder and decoder checkpoint requires the model to be fine-tuned on a downstream task, as has been shown in [the *Warm-starting-encoder-decoder blog post*](https://huggingface.co/blog/warm-starting-encoder-decoder). -To do so, the `VisionEncoderDecoderModel` class provides a [`VisionEncoderDecoderModel.from_encoder_decoder_pretrained`] method. - -```python ->>> from transformers import VisionEncoderDecoderModel - ->>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained( -... "microsoft/swin-base-patch4-window7-224-in22k", "bert-base-uncased" -... ) -``` - -## Loading an existing `VisionEncoderDecoderModel` checkpoint and perform inference. - -To load fine-tuned checkpoints of the `VisionEncoderDecoderModel` class, [`VisionEncoderDecoderModel`] provides the `from_pretrained(...)` method just like any other model architecture in Transformers. - -To perform inference, one uses the [`generate`] method, which allows to autoregressively generate text. This method supports various forms of decoding, such as greedy, beam search and multinomial sampling. 
- -```python ->>> import requests ->>> from PIL import Image - ->>> from transformers import GPT2TokenizerFast, ViTImageProcessor, VisionEncoderDecoderModel - ->>> # load a fine-tuned image captioning model and corresponding tokenizer and image processor ->>> model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning") ->>> tokenizer = GPT2TokenizerFast.from_pretrained("nlpconnect/vit-gpt2-image-captioning") ->>> image_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning") - ->>> # let's perform inference on an image ->>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" ->>> image = Image.open(requests.get(url, stream=True).raw) ->>> pixel_values = image_processor(image, return_tensors="pt").pixel_values - ->>> # autoregressively generate caption (uses greedy decoding by default) ->>> generated_ids = model.generate(pixel_values) ->>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] ->>> print(generated_text) -a cat laying on a blanket next to a cat laying on a bed -``` - -## Loading a PyTorch checkpoint into `TFVisionEncoderDecoderModel`. - -[`TFVisionEncoderDecoderModel.from_pretrained`] currently doesn't support initializing the model from a -PyTorch checkpoint. Passing `from_pt=True` to this method will throw an exception. If there are only PyTorch -checkpoints for a particular vision encoder-decoder model, a workaround is: - -```python ->>> from transformers import VisionEncoderDecoderModel, TFVisionEncoderDecoderModel - ->>> _model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning") - ->>> _model.encoder.save_pretrained("./encoder") ->>> _model.decoder.save_pretrained("./decoder") - ->>> model = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained( -... "./encoder", "./decoder", encoder_from_pt=True, decoder_from_pt=True -... ) ->>> # This is only for copying some specific attributes of this particular model. ->>> model.config = _model.config -``` - -## Training - -Once the model is created, it can be fine-tuned similar to BART, T5 or any other encoder-decoder model on a dataset of (image, text) pairs. -As you can see, only 2 inputs are required for the model in order to compute a loss: `pixel_values` (which are the -images) and `labels` (which are the `input_ids` of the encoded target sequence). - -```python ->>> from transformers import ViTImageProcessor, BertTokenizer, VisionEncoderDecoderModel ->>> from datasets import load_dataset - ->>> image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") ->>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") ->>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained( -... "google/vit-base-patch16-224-in21k", "bert-base-uncased" -... ) - ->>> model.config.decoder_start_token_id = tokenizer.cls_token_id ->>> model.config.pad_token_id = tokenizer.pad_token_id - ->>> dataset = load_dataset("huggingface/cats-image") ->>> image = dataset["test"]["image"][0] ->>> pixel_values = image_processor(image, return_tensors="pt").pixel_values - ->>> labels = tokenizer( -... "an image of two cats chilling on a couch", -... return_tensors="pt", -... ).input_ids - ->>> # the forward function automatically creates the correct decoder_input_ids ->>> loss = model(pixel_values=pixel_values, labels=labels).loss -``` - -This model was contributed by [nielsr](https://github.com/nielsrogge). 
This model's TensorFlow and Flax versions -were contributed by [ydshieh](https://github.com/ydshieh). - -## VisionEncoderDecoderConfig - -[[autodoc]] VisionEncoderDecoderConfig - -## VisionEncoderDecoderModel - -[[autodoc]] VisionEncoderDecoderModel - - forward - - from_encoder_decoder_pretrained - -## TFVisionEncoderDecoderModel - -[[autodoc]] TFVisionEncoderDecoderModel - - call - - from_encoder_decoder_pretrained - -## FlaxVisionEncoderDecoderModel - -[[autodoc]] FlaxVisionEncoderDecoderModel - - __call__ - - from_encoder_decoder_pretrained diff --git a/docs/source/en/model_doc/vision-text-dual-encoder.md b/docs/source/en/model_doc/vision-text-dual-encoder.md new file mode 100644 index 000000000000..6fa9728cac46 --- /dev/null +++ b/docs/source/en/model_doc/vision-text-dual-encoder.md @@ -0,0 +1,52 @@ + + +# VisionTextDualEncoder + +## Overview + +The [`VisionTextDualEncoderModel`] can be used to initialize a vision-text dual encoder model with +any pretrained vision autoencoding model as the vision encoder (*e.g.* [ViT](vit), [BEiT](beit), [DeiT](deit)) and any pretrained text autoencoding model as the text encoder (*e.g.* [RoBERTa](roberta), [BERT](bert)). Two projection layers are added on top of both the vision and text encoder to project the output embeddings +to a shared latent space. The projection layers are randomly initialized so the model should be fine-tuned on a +downstream task. This model can be used to align the vision-text embeddings using CLIP like contrastive image-text +training and then can be used for zero-shot vision tasks such image-classification or retrieval. + +In [LiT: Zero-Shot Transfer with Locked-image Text Tuning](https://arxiv.org/abs/2111.07991) it is shown how +leveraging pre-trained (locked/frozen) image and text model for contrastive learning yields significant improvement on +new zero-shot vision tasks such as image classification or retrieval. + +## VisionTextDualEncoderConfig + +[[autodoc]] VisionTextDualEncoderConfig + +## VisionTextDualEncoderProcessor + +[[autodoc]] VisionTextDualEncoderProcessor + +## VisionTextDualEncoderModel + +[[autodoc]] VisionTextDualEncoderModel + - forward + +## FlaxVisionTextDualEncoderModel + +[[autodoc]] FlaxVisionTextDualEncoderModel + - __call__ + +## TFVisionTextDualEncoderModel + +[[autodoc]] TFVisionTextDualEncoderModel + - call diff --git a/docs/source/en/model_doc/vision-text-dual-encoder.mdx b/docs/source/en/model_doc/vision-text-dual-encoder.mdx deleted file mode 100644 index c7ee59d77abb..000000000000 --- a/docs/source/en/model_doc/vision-text-dual-encoder.mdx +++ /dev/null @@ -1,43 +0,0 @@ - - -# VisionTextDualEncoder - -## Overview - -The [`VisionTextDualEncoderModel`] can be used to initialize a vision-text dual encoder model with -any pretrained vision autoencoding model as the vision encoder (*e.g.* [ViT](vit), [BEiT](beit), [DeiT](deit)) and any pretrained text autoencoding model as the text encoder (*e.g.* [RoBERTa](roberta), [BERT](bert)). Two projection layers are added on top of both the vision and text encoder to project the output embeddings -to a shared latent space. The projection layers are randomly initialized so the model should be fine-tuned on a -downstream task. This model can be used to align the vision-text embeddings using CLIP like contrastive image-text -training and then can be used for zero-shot vision tasks such image-classification or retrieval. 
- -In [LiT: Zero-Shot Transfer with Locked-image Text Tuning](https://arxiv.org/abs/2111.07991) it is shown how -leveraging pre-trained (locked/frozen) image and text model for contrastive learning yields significant improvement on -new zero-shot vision tasks such as image classification or retrieval. - -## VisionTextDualEncoderConfig - -[[autodoc]] VisionTextDualEncoderConfig - -## VisionTextDualEncoderProcessor - -[[autodoc]] VisionTextDualEncoderProcessor - -## VisionTextDualEncoderModel - -[[autodoc]] VisionTextDualEncoderModel - - forward - -## FlaxVisionTextDualEncoderModel - -[[autodoc]] FlaxVisionTextDualEncoderModel - - __call__ diff --git a/docs/source/en/model_doc/visual_bert.md b/docs/source/en/model_doc/visual_bert.md new file mode 100644 index 000000000000..7d84c0d9faec --- /dev/null +++ b/docs/source/en/model_doc/visual_bert.md @@ -0,0 +1,129 @@ + + +# VisualBERT + +## Overview + +The VisualBERT model was proposed in [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. +VisualBERT is a neural network trained on a variety of (image, text) pairs. + +The abstract from the paper is the following: + +*We propose VisualBERT, a simple and flexible framework for modeling a broad range of vision-and-language tasks. +VisualBERT consists of a stack of Transformer layers that implicitly align elements of an input text and regions in an +associated input image with self-attention. We further propose two visually-grounded language model objectives for +pre-training VisualBERT on image caption data. Experiments on four vision-and-language tasks including VQA, VCR, NLVR2, +and Flickr30K show that VisualBERT outperforms or rivals with state-of-the-art models while being significantly +simpler. Further analysis demonstrates that VisualBERT can ground elements of language to image regions without any +explicit supervision and is even sensitive to syntactic relationships, tracking, for example, associations between +verbs and image regions corresponding to their arguments.* + +Tips: + +1. Most of the checkpoints provided work with the [`VisualBertForPreTraining`] configuration. Other + checkpoints provided are the fine-tuned checkpoints for down-stream tasks - VQA ('visualbert-vqa'), VCR + ('visualbert-vcr'), NLVR2 ('visualbert-nlvr2'). Hence, if you are not working on these downstream tasks, it is + recommended that you use the pretrained checkpoints. + +2. For the VCR task, the authors use a fine-tuned detector for generating visual embeddings, for all the checkpoints. + We do not provide the detector and its weights as a part of the package, but it will be available in the research + projects, and the states can be loaded directly into the detector provided. + +## Usage + +VisualBERT is a multi-modal vision and language model. It can be used for visual question answering, multiple choice, +visual reasoning and region-to-phrase correspondence tasks. VisualBERT uses a BERT-like transformer to prepare +embeddings for image-text pairs. Both the text and visual features are then projected to a latent space with identical +dimension. + +To feed images to the model, each image is passed through a pre-trained object detector and the regions and the +bounding boxes are extracted. The authors use the features generated after passing these regions through a pre-trained +CNN like ResNet as visual embeddings. 
They also add absolute position embeddings, and feed the resulting sequence of +vectors to a standard BERT model. The text input is concatenated in the front of the visual embeddings in the embedding +layer, and is expected to be bound by [CLS] and a [SEP] tokens, as in BERT. The segment IDs must also be set +appropriately for the textual and visual parts. + +The [`BertTokenizer`] is used to encode the text. A custom detector/image processor must be used +to get the visual embeddings. The following example notebooks show how to use VisualBERT with Detectron-like models: + +- [VisualBERT VQA demo notebook](https://github.com/huggingface/transformers/tree/main/examples/research_projects/visual_bert) : This notebook + contains an example on VisualBERT VQA. + +- [Generate Embeddings for VisualBERT (Colab Notebook)](https://colab.research.google.com/drive/1bLGxKdldwqnMVA5x4neY7-l_8fKGWQYI?usp=sharing) : This notebook contains + an example on how to generate visual embeddings. + +The following example shows how to get the last hidden state using [`VisualBertModel`]: + +```python +>>> import torch +>>> from transformers import BertTokenizer, VisualBertModel + +>>> model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre") +>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + +>>> inputs = tokenizer("What is the man eating?", return_tensors="pt") +>>> # this is a custom function that returns the visual embeddings given the image path +>>> visual_embeds = get_visual_embeddings(image_path) + +>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) +>>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float) +>>> inputs.update( +... { +... "visual_embeds": visual_embeds, +... "visual_token_type_ids": visual_token_type_ids, +... "visual_attention_mask": visual_attention_mask, +... } +... ) +>>> outputs = model(**inputs) +>>> last_hidden_state = outputs.last_hidden_state +``` + +This model was contributed by [gchhablani](https://huggingface.co/gchhablani). The original code can be found [here](https://github.com/uclanlp/visualbert). + +## VisualBertConfig + +[[autodoc]] VisualBertConfig + +## VisualBertModel + +[[autodoc]] VisualBertModel + - forward + +## VisualBertForPreTraining + +[[autodoc]] VisualBertForPreTraining + - forward + +## VisualBertForQuestionAnswering + +[[autodoc]] VisualBertForQuestionAnswering + - forward + +## VisualBertForMultipleChoice + +[[autodoc]] VisualBertForMultipleChoice + - forward + +## VisualBertForVisualReasoning + +[[autodoc]] VisualBertForVisualReasoning + - forward + +## VisualBertForRegionToPhraseAlignment + +[[autodoc]] VisualBertForRegionToPhraseAlignment + - forward diff --git a/docs/source/en/model_doc/visual_bert.mdx b/docs/source/en/model_doc/visual_bert.mdx deleted file mode 100644 index df8858b1fa67..000000000000 --- a/docs/source/en/model_doc/visual_bert.mdx +++ /dev/null @@ -1,125 +0,0 @@ - - -# VisualBERT - -## Overview - -The VisualBERT model was proposed in [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -VisualBERT is a neural network trained on a variety of (image, text) pairs. - -The abstract from the paper is the following: - -*We propose VisualBERT, a simple and flexible framework for modeling a broad range of vision-and-language tasks. 
-VisualBERT consists of a stack of Transformer layers that implicitly align elements of an input text and regions in an -associated input image with self-attention. We further propose two visually-grounded language model objectives for -pre-training VisualBERT on image caption data. Experiments on four vision-and-language tasks including VQA, VCR, NLVR2, -and Flickr30K show that VisualBERT outperforms or rivals with state-of-the-art models while being significantly -simpler. Further analysis demonstrates that VisualBERT can ground elements of language to image regions without any -explicit supervision and is even sensitive to syntactic relationships, tracking, for example, associations between -verbs and image regions corresponding to their arguments.* - -Tips: - -1. Most of the checkpoints provided work with the [`VisualBertForPreTraining`] configuration. Other - checkpoints provided are the fine-tuned checkpoints for down-stream tasks - VQA ('visualbert-vqa'), VCR - ('visualbert-vcr'), NLVR2 ('visualbert-nlvr2'). Hence, if you are not working on these downstream tasks, it is - recommended that you use the pretrained checkpoints. - -2. For the VCR task, the authors use a fine-tuned detector for generating visual embeddings, for all the checkpoints. - We do not provide the detector and its weights as a part of the package, but it will be available in the research - projects, and the states can be loaded directly into the detector provided. - -## Usage - -VisualBERT is a multi-modal vision and language model. It can be used for visual question answering, multiple choice, -visual reasoning and region-to-phrase correspondence tasks. VisualBERT uses a BERT-like transformer to prepare -embeddings for image-text pairs. Both the text and visual features are then projected to a latent space with identical -dimension. - -To feed images to the model, each image is passed through a pre-trained object detector and the regions and the -bounding boxes are extracted. The authors use the features generated after passing these regions through a pre-trained -CNN like ResNet as visual embeddings. They also add absolute position embeddings, and feed the resulting sequence of -vectors to a standard BERT model. The text input is concatenated in the front of the visual embeddings in the embedding -layer, and is expected to be bound by [CLS] and a [SEP] tokens, as in BERT. The segment IDs must also be set -appropriately for the textual and visual parts. - -The [`BertTokenizer`] is used to encode the text. A custom detector/image processor must be used -to get the visual embeddings. The following example notebooks show how to use VisualBERT with Detectron-like models: - -- [VisualBERT VQA demo notebook](https://github.com/huggingface/transformers/tree/main/examples/research_projects/visual_bert) : This notebook - contains an example on VisualBERT VQA. - -- [Generate Embeddings for VisualBERT (Colab Notebook)](https://colab.research.google.com/drive/1bLGxKdldwqnMVA5x4neY7-l_8fKGWQYI?usp=sharing) : This notebook contains - an example on how to generate visual embeddings. 
- -The following example shows how to get the last hidden state using [`VisualBertModel`]: - -```python ->>> import torch ->>> from transformers import BertTokenizer, VisualBertModel - ->>> model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre") ->>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") - ->>> inputs = tokenizer("What is the man eating?", return_tensors="pt") ->>> # this is a custom function that returns the visual embeddings given the image path ->>> visual_embeds = get_visual_embeddings(image_path) - ->>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) ->>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float) ->>> inputs.update( -... { -... "visual_embeds": visual_embeds, -... "visual_token_type_ids": visual_token_type_ids, -... "visual_attention_mask": visual_attention_mask, -... } -... ) ->>> outputs = model(**inputs) ->>> last_hidden_state = outputs.last_hidden_state -``` - -This model was contributed by [gchhablani](https://huggingface.co/gchhablani). The original code can be found [here](https://github.com/uclanlp/visualbert). - -## VisualBertConfig - -[[autodoc]] VisualBertConfig - -## VisualBertModel - -[[autodoc]] VisualBertModel - - forward - -## VisualBertForPreTraining - -[[autodoc]] VisualBertForPreTraining - - forward - -## VisualBertForQuestionAnswering - -[[autodoc]] VisualBertForQuestionAnswering - - forward - -## VisualBertForMultipleChoice - -[[autodoc]] VisualBertForMultipleChoice - - forward - -## VisualBertForVisualReasoning - -[[autodoc]] VisualBertForVisualReasoning - - forward - -## VisualBertForRegionToPhraseAlignment - -[[autodoc]] VisualBertForRegionToPhraseAlignment - - forward diff --git a/docs/source/en/model_doc/vit.md b/docs/source/en/model_doc/vit.md new file mode 100644 index 000000000000..409580d09481 --- /dev/null +++ b/docs/source/en/model_doc/vit.md @@ -0,0 +1,186 @@ + + +# Vision Transformer (ViT) + +## Overview + +The Vision Transformer (ViT) model was proposed in [An Image is Worth 16x16 Words: Transformers for Image Recognition +at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk +Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob +Uszkoreit, Neil Houlsby. It's the first paper that successfully trains a Transformer encoder on ImageNet, attaining +very good results compared to familiar convolutional architectures. + + +The abstract from the paper is the following: + +*While the Transformer architecture has become the de-facto standard for natural language processing tasks, its +applications to computer vision remain limited. In vision, attention is either applied in conjunction with +convolutional networks, or used to replace certain components of convolutional networks while keeping their overall +structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to +sequences of image patches can perform very well on image classification tasks. 
When pre-trained on large amounts of +data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.), +Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring +substantially fewer computational resources to train.* + +Tips: + +- Demo notebooks regarding inference as well as fine-tuning ViT on custom data can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer). +- To feed images to the Transformer encoder, each image is split into a sequence of fixed-size non-overlapping patches, + which are then linearly embedded. A [CLS] token is added to serve as representation of an entire image, which can be + used for classification. The authors also add absolute position embeddings, and feed the resulting sequence of + vectors to a standard Transformer encoder. +- As the Vision Transformer expects each image to be of the same size (resolution), one can use + [`ViTImageProcessor`] to resize (or rescale) and normalize images for the model. +- Both the patch resolution and image resolution used during pre-training or fine-tuning are reflected in the name of + each checkpoint. For example, `google/vit-base-patch16-224` refers to a base-sized architecture with patch + resolution of 16x16 and fine-tuning resolution of 224x224. All checkpoints can be found on the [hub](https://huggingface.co/models?search=vit). +- The available checkpoints are either (1) pre-trained on [ImageNet-21k](http://www.image-net.org/) (a collection of + 14 million images and 21k classes) only, or (2) also fine-tuned on [ImageNet](http://www.image-net.org/challenges/LSVRC/2012/) (also referred to as ILSVRC 2012, a collection of 1.3 million + images and 1,000 classes). +- The Vision Transformer was pre-trained using a resolution of 224x224. During fine-tuning, it is often beneficial to + use a higher resolution than pre-training [(Touvron et al., 2019)](https://arxiv.org/abs/1906.06423), [(Kolesnikov + et al., 2020)](https://arxiv.org/abs/1912.11370). In order to fine-tune at higher resolution, the authors perform + 2D interpolation of the pre-trained position embeddings, according to their location in the original image. +- The best results are obtained with supervised pre-training, which is not the case in NLP. The authors also performed + an experiment with a self-supervised pre-training objective, namely masked patched prediction (inspired by masked + language modeling). With this approach, the smaller ViT-B/16 model achieves 79.9% accuracy on ImageNet, a significant + improvement of 2% to training from scratch, but still 4% behind supervised pre-training. + + + + ViT architecture. Taken from the original paper. + +Following the original Vision Transformer, some follow-up works have been made: + +- [DeiT](deit) (Data-efficient Image Transformers) by Facebook AI. DeiT models are distilled vision transformers. + The authors of DeiT also released more efficiently trained ViT models, which you can directly plug into [`ViTModel`] or + [`ViTForImageClassification`]. There are 4 variants available (in 3 different sizes): *facebook/deit-tiny-patch16-224*, + *facebook/deit-small-patch16-224*, *facebook/deit-base-patch16-224* and *facebook/deit-base-patch16-384*. Note that one should + use [`DeiTImageProcessor`] in order to prepare images for the model. + +- [BEiT](beit) (BERT pre-training of Image Transformers) by Microsoft Research. 
BEiT models outperform supervised pre-trained + vision transformers using a self-supervised method inspired by BERT (masked image modeling) and based on a VQ-VAE. + +- DINO (a method for self-supervised training of Vision Transformers) by Facebook AI. Vision Transformers trained using + the DINO method show very interesting properties not seen with convolutional models. They are capable of segmenting + objects, without having ever been trained to do so. DINO checkpoints can be found on the [hub](https://huggingface.co/models?other=dino). + +- [MAE](vit_mae) (Masked Autoencoders) by Facebook AI. By pre-training Vision Transformers to reconstruct pixel values for a high portion + (75%) of masked patches (using an asymmetric encoder-decoder architecture), the authors show that this simple method outperforms + supervised pre-training after fine-tuning. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code (written in JAX) can be +found [here](https://github.com/google-research/vision_transformer). + +Note that we converted the weights from Ross Wightman's [timm library](https://github.com/rwightman/pytorch-image-models), who already converted the weights from JAX to PyTorch. Credits +go to him! + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT. + + + +- [`ViTForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). +- A blog on fine-tuning [`ViTForImageClassification`] on a custom dataset can be found [here](https://huggingface.co/blog/fine-tune-vit). +- More demo notebooks to fine-tune [`ViTForImageClassification`] can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer). +- [Image classification task guide](../tasks/image_classification) + +Besides that: + +- [`ViTForMaskedImageModeling`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining). + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. 
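+Before diving into the resources below, the tips above can be tied together in a minimal inference sketch. This snippet assumes the `google/vit-base-patch16-224` checkpoint and a sample COCO image; any ViT classification checkpoint should work the same way:
+
+```python
+import requests
+import torch
+from PIL import Image
+
+from transformers import ViTForImageClassification, ViTImageProcessor
+
+# load a sample image (any RGB image works)
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+# the image processor resizes/rescales and normalizes the image for the model
+processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
+model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
+
+inputs = processor(images=image, return_tensors="pt")
+with torch.no_grad():
+    logits = model(**inputs).logits
+
+# this checkpoint is fine-tuned on ImageNet-1k, so there are 1,000 classes
+predicted_class_idx = logits.argmax(-1).item()
+print(model.config.id2label[predicted_class_idx])
+```
+
+For fine-tuning on your own data, the blog posts and notebooks listed below walk through both the [`Trainer`] and PyTorch Lightning workflows.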
+ +`ViTForImageClassification` is supported by: + + +- A blog post on how to [Fine-Tune ViT for Image Classification with Hugging Face Transformers](https://huggingface.co/blog/fine-tune-vit) +- A blog post on [Image Classification with Hugging Face Transformers and `Keras`](https://www.philschmid.de/image-classification-huggingface-transformers-keras) +- A notebook on [Fine-tuning for Image Classification with Hugging Face Transformers](https://github.com/huggingface/notebooks/blob/main/examples/image_classification.ipynb) +- A notebook on how to [Fine-tune the Vision Transformer on CIFAR-10 with the Hugging Face Trainer](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) +- A notebook on how to [Fine-tune the Vision Transformer on CIFAR-10 with PyTorch Lightning](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) + +⚗️ Optimization + +- A blog post on how to [Accelerate Vision Transformer (ViT) with Quantization using Optimum](https://www.philschmid.de/optimizing-vision-transformer) + +⚡️ Inference + +- A notebook on [Quick demo: Vision Transformer (ViT) by Google Brain](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Quick_demo_of_HuggingFace_version_of_Vision_Transformer_inference.ipynb) + +🚀 Deploy + +- A blog post on [Deploying Tensorflow Vision Models in Hugging Face with TF Serving](https://huggingface.co/blog/tf-serving-vision) +- A blog post on [Deploying Hugging Face ViT on Vertex AI](https://huggingface.co/blog/deploy-vertex-ai) +- A blog post on [Deploying Hugging Face ViT on Kubernetes with TF Serving](https://huggingface.co/blog/deploy-tfserving-kubernetes) + + +## ViTConfig + +[[autodoc]] ViTConfig + +## ViTFeatureExtractor + +[[autodoc]] ViTFeatureExtractor + - __call__ + + +## ViTImageProcessor + +[[autodoc]] ViTImageProcessor + - preprocess + +## ViTModel + +[[autodoc]] ViTModel + - forward + +## ViTForMaskedImageModeling + +[[autodoc]] ViTForMaskedImageModeling + - forward + +## ViTForImageClassification + +[[autodoc]] ViTForImageClassification + - forward + +## TFViTModel + +[[autodoc]] TFViTModel + - call + +## TFViTForImageClassification + +[[autodoc]] TFViTForImageClassification + - call + +## FlaxVitModel + +[[autodoc]] FlaxViTModel + - __call__ + +## FlaxViTForImageClassification + +[[autodoc]] FlaxViTForImageClassification + - __call__ diff --git a/docs/source/en/model_doc/vit.mdx b/docs/source/en/model_doc/vit.mdx deleted file mode 100644 index de31278dfe75..000000000000 --- a/docs/source/en/model_doc/vit.mdx +++ /dev/null @@ -1,166 +0,0 @@ - - -# Vision Transformer (ViT) - -## Overview - -The Vision Transformer (ViT) model was proposed in [An Image is Worth 16x16 Words: Transformers for Image Recognition -at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk -Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob -Uszkoreit, Neil Houlsby. It's the first paper that successfully trains a Transformer encoder on ImageNet, attaining -very good results compared to familiar convolutional architectures. 
- - -The abstract from the paper is the following: - -*While the Transformer architecture has become the de-facto standard for natural language processing tasks, its -applications to computer vision remain limited. In vision, attention is either applied in conjunction with -convolutional networks, or used to replace certain components of convolutional networks while keeping their overall -structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to -sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of -data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.), -Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring -substantially fewer computational resources to train.* - -Tips: - -- Demo notebooks regarding inference as well as fine-tuning ViT on custom data can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer). -- To feed images to the Transformer encoder, each image is split into a sequence of fixed-size non-overlapping patches, - which are then linearly embedded. A [CLS] token is added to serve as representation of an entire image, which can be - used for classification. The authors also add absolute position embeddings, and feed the resulting sequence of - vectors to a standard Transformer encoder. -- As the Vision Transformer expects each image to be of the same size (resolution), one can use - [`ViTImageProcessor`] to resize (or rescale) and normalize images for the model. -- Both the patch resolution and image resolution used during pre-training or fine-tuning are reflected in the name of - each checkpoint. For example, `google/vit-base-patch16-224` refers to a base-sized architecture with patch - resolution of 16x16 and fine-tuning resolution of 224x224. All checkpoints can be found on the [hub](https://huggingface.co/models?search=vit). -- The available checkpoints are either (1) pre-trained on [ImageNet-21k](http://www.image-net.org/) (a collection of - 14 million images and 21k classes) only, or (2) also fine-tuned on [ImageNet](http://www.image-net.org/challenges/LSVRC/2012/) (also referred to as ILSVRC 2012, a collection of 1.3 million - images and 1,000 classes). -- The Vision Transformer was pre-trained using a resolution of 224x224. During fine-tuning, it is often beneficial to - use a higher resolution than pre-training [(Touvron et al., 2019)](https://arxiv.org/abs/1906.06423), [(Kolesnikov - et al., 2020)](https://arxiv.org/abs/1912.11370). In order to fine-tune at higher resolution, the authors perform - 2D interpolation of the pre-trained position embeddings, according to their location in the original image. -- The best results are obtained with supervised pre-training, which is not the case in NLP. The authors also performed - an experiment with a self-supervised pre-training objective, namely masked patched prediction (inspired by masked - language modeling). With this approach, the smaller ViT-B/16 model achieves 79.9% accuracy on ImageNet, a significant - improvement of 2% to training from scratch, but still 4% behind supervised pre-training. - - - - ViT architecture. Taken from the original paper. - -Following the original Vision Transformer, some follow-up works have been made: - -- [DeiT](deit) (Data-efficient Image Transformers) by Facebook AI. 
DeiT models are distilled vision transformers. - The authors of DeiT also released more efficiently trained ViT models, which you can directly plug into [`ViTModel`] or - [`ViTForImageClassification`]. There are 4 variants available (in 3 different sizes): *facebook/deit-tiny-patch16-224*, - *facebook/deit-small-patch16-224*, *facebook/deit-base-patch16-224* and *facebook/deit-base-patch16-384*. Note that one should - use [`DeiTImageProcessor`] in order to prepare images for the model. - -- [BEiT](beit) (BERT pre-training of Image Transformers) by Microsoft Research. BEiT models outperform supervised pre-trained - vision transformers using a self-supervised method inspired by BERT (masked image modeling) and based on a VQ-VAE. - -- DINO (a method for self-supervised training of Vision Transformers) by Facebook AI. Vision Transformers trained using - the DINO method show very interesting properties not seen with convolutional models. They are capable of segmenting - objects, without having ever been trained to do so. DINO checkpoints can be found on the [hub](https://huggingface.co/models?other=dino). - -- [MAE](vit_mae) (Masked Autoencoders) by Facebook AI. By pre-training Vision Transformers to reconstruct pixel values for a high portion - (75%) of masked patches (using an asymmetric encoder-decoder architecture), the authors show that this simple method outperforms - supervised pre-training after fine-tuning. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code (written in JAX) can be -found [here](https://github.com/google-research/vision_transformer). - -Note that we converted the weights from Ross Wightman's [timm library](https://github.com/rwightman/pytorch-image-models), who already converted the weights from JAX to PyTorch. Credits -go to him! - - -## Resources - -A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. 
- -`ViTForImageClassification` is supported by: - - -- A blog post on how to [Fine-Tune ViT for Image Classification with Hugging Face Transformers](https://huggingface.co/blog/fine-tune-vit) -- A blog post on [Image Classification with Hugging Face Transformers and `Keras`](https://www.philschmid.de/image-classification-huggingface-transformers-keras) -- A notebook on [Fine-tuning for Image Classification with Hugging Face Transformers](https://github.com/huggingface/notebooks/blob/main/examples/image_classification.ipynb) -- A notebook on how to [Fine-tune the Vision Transformer on CIFAR-10 with the Hugging Face Trainer](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) -- A notebook on how to [Fine-tune the Vision Transformer on CIFAR-10 with PyTorch Lightning](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) - -⚗️ Optimization - -- A blog post on how to [Accelerate Vision Transformer (ViT) with Quantization using Optimum](https://www.philschmid.de/optimizing-vision-transformer) - -⚡️ Inference - -- A notebook on [Quick demo: Vision Transformer (ViT) by Google Brain](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Quick_demo_of_HuggingFace_version_of_Vision_Transformer_inference.ipynb) - -🚀 Deploy - -- A blog post on [Deploying Tensorflow Vision Models in Hugging Face with TF Serving](https://huggingface.co/blog/tf-serving-vision) -- A blog post on [Deploying Hugging Face ViT on Vertex AI](https://huggingface.co/blog/deploy-vertex-ai) -- A blog post on [Deploying Hugging Face ViT on Kubernetes with TF Serving](https://huggingface.co/blog/deploy-tfserving-kubernetes) - - -## ViTConfig - -[[autodoc]] ViTConfig - -## ViTFeatureExtractor - -[[autodoc]] ViTFeatureExtractor - - __call__ - - -## ViTImageProcessor - -[[autodoc]] ViTImageProcessor - - preprocess - -## ViTModel - -[[autodoc]] ViTModel - - forward - -## ViTForMaskedImageModeling - -[[autodoc]] ViTForMaskedImageModeling - - forward - -## ViTForImageClassification - -[[autodoc]] ViTForImageClassification - - forward - -## TFViTModel - -[[autodoc]] TFViTModel - - call - -## TFViTForImageClassification - -[[autodoc]] TFViTForImageClassification - - call - -## FlaxVitModel - -[[autodoc]] FlaxViTModel - - __call__ - -## FlaxViTForImageClassification - -[[autodoc]] FlaxViTForImageClassification - - __call__ diff --git a/docs/source/en/model_doc/vit_hybrid.md b/docs/source/en/model_doc/vit_hybrid.md new file mode 100644 index 000000000000..84969cd0f622 --- /dev/null +++ b/docs/source/en/model_doc/vit_hybrid.md @@ -0,0 +1,73 @@ + + +# Hybrid Vision Transformer (ViT Hybrid) + +## Overview + +The hybrid Vision Transformer (ViT) model was proposed in [An Image is Worth 16x16 Words: Transformers for Image Recognition +at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk +Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob +Uszkoreit, Neil Houlsby. It's the first paper that successfully trains a Transformer encoder on ImageNet, attaining +very good results compared to familiar convolutional architectures. 
ViT hybrid is a slight variant of the [plain Vision Transformer](vit), +by leveraging a convolutional backbone (specifically, [BiT](bit)) whose features are used as initial "tokens" for the Transformer. + + +The abstract from the paper is the following: + +*While the Transformer architecture has become the de-facto standard for natural language processing tasks, its +applications to computer vision remain limited. In vision, attention is either applied in conjunction with +convolutional networks, or used to replace certain components of convolutional networks while keeping their overall +structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to +sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of +data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.), +Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring +substantially fewer computational resources to train.* + +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code (written in JAX) can be +found [here](https://github.com/google-research/vision_transformer). + + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT Hybrid. + + + +- [`ViTHybridForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). +- See also: [Image classification task guide](../tasks/image_classification) + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + + +## ViTHybridConfig + +[[autodoc]] ViTHybridConfig + +## ViTHybridImageProcessor + +[[autodoc]] ViTHybridImageProcessor + - preprocess + +## ViTHybridModel + +[[autodoc]] ViTHybridModel + - forward + +## ViTHybridForImageClassification + +[[autodoc]] ViTHybridForImageClassification + - forward diff --git a/docs/source/en/model_doc/vit_hybrid.mdx b/docs/source/en/model_doc/vit_hybrid.mdx deleted file mode 100644 index 8885af0dfe0f..000000000000 --- a/docs/source/en/model_doc/vit_hybrid.mdx +++ /dev/null @@ -1,68 +0,0 @@ - - -# Hybrid Vision Transformer (ViT Hybrid) - -## Overview - -The hybrid Vision Transformer (ViT) model was proposed in [An Image is Worth 16x16 Words: Transformers for Image Recognition -at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk -Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob -Uszkoreit, Neil Houlsby. It's the first paper that successfully trains a Transformer encoder on ImageNet, attaining -very good results compared to familiar convolutional architectures. ViT hybrid is a slight variant of the [plain Vision Transformer](vit), -by leveraging a convolutional backbone (specifically, [BiT](bit)) whose features are used as initial "tokens" for the Transformer. 
- - -The abstract from the paper is the following: - -*While the Transformer architecture has become the de-facto standard for natural language processing tasks, its -applications to computer vision remain limited. In vision, attention is either applied in conjunction with -convolutional networks, or used to replace certain components of convolutional networks while keeping their overall -structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to -sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of -data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.), -Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring -substantially fewer computational resources to train.* - -This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code (written in JAX) can be -found [here](https://github.com/google-research/vision_transformer). - - -## Resources - -A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT Hybrid. - - - -- [`ViTHybridForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). - -If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. - - -## ViTHybridConfig - -[[autodoc]] ViTHybridConfig - -## ViTHybridImageProcessor - -[[autodoc]] ViTHybridImageProcessor - - preprocess - -## ViTHybridModel - -[[autodoc]] ViTHybridModel - - forward - -## ViTHybridForImageClassification - -[[autodoc]] ViTHybridForImageClassification - - forward diff --git a/docs/source/en/model_doc/vit_mae.md b/docs/source/en/model_doc/vit_mae.md new file mode 100644 index 000000000000..c14cc7e57c90 --- /dev/null +++ b/docs/source/en/model_doc/vit_mae.md @@ -0,0 +1,90 @@ + + +# ViTMAE + +## Overview + +The ViTMAE model was proposed in [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377v2) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, +Piotr Dollár, Ross Girshick. The paper shows that, by pre-training a Vision Transformer (ViT) to reconstruct pixel values for masked patches, one can get results after +fine-tuning that outperform supervised pre-training. + +The abstract from the paper is the following: + +*This paper shows that masked autoencoders (MAE) are scalable self-supervised learners for computer vision. Our MAE approach is simple: we mask random patches of the +input image and reconstruct the missing pixels. It is based on two core designs. First, we develop an asymmetric encoder-decoder architecture, with an encoder that operates +only on the visible subset of patches (without mask tokens), along with a lightweight decoder that reconstructs the original image from the latent representation and mask +tokens. Second, we find that masking a high proportion of the input image, e.g., 75%, yields a nontrivial and meaningful self-supervisory task. 
Coupling these two designs +enables us to train large models efficiently and effectively: we accelerate training (by 3x or more) and improve accuracy. Our scalable approach allows for learning high-capacity +models that generalize well: e.g., a vanilla ViT-Huge model achieves the best accuracy (87.8%) among methods that use only ImageNet-1K data. Transfer performance in downstream +tasks outperforms supervised pre-training and shows promising scaling behavior.* + +Tips: + +- MAE (masked auto encoding) is a method for self-supervised pre-training of Vision Transformers (ViTs). The pre-training objective is relatively simple: +by masking a large portion (75%) of the image patches, the model must reconstruct raw pixel values. One can use [`ViTMAEForPreTraining`] for this purpose. +- After pre-training, one "throws away" the decoder used to reconstruct pixels, and one uses the encoder for fine-tuning/linear probing. This means that after +fine-tuning, one can directly plug in the weights into a [`ViTForImageClassification`]. +- One can use [`ViTImageProcessor`] to prepare images for the model. See the code examples for more info. +- Note that the encoder of MAE is only used to encode the visual patches. The encoded patches are then concatenated with mask tokens, which the decoder (which also +consists of Transformer blocks) takes as input. Each mask token is a shared, learned vector that indicates the presence of a missing patch to be predicted. Fixed +sin/cos position embeddings are added both to the input of the encoder and the decoder. +- For a visual understanding of how MAEs work you can check out this [post](https://keras.io/examples/vision/masked_image_modeling/). + + + + MAE architecture. Taken from the original paper. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). TensorFlow version of the model was contributed by [sayakpaul](https://github.com/sayakpaul) and +[ariG23498](https://github.com/ariG23498) (equal contribution). The original code can be found [here](https://github.com/facebookresearch/mae). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViTMAE. + +- [`ViTMAEForPreTraining`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining), allowing you to pre-train the model from scratch/further pre-train the model on custom data. +- A notebook that illustrates how to visualize reconstructed pixel values with [`ViTMAEForPreTraining`] can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/ViTMAE/ViT_MAE_visualization_demo.ipynb). + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. 
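+The tips above translate into a short sketch: the snippet below runs [`ViTMAEForPreTraining`] on a single image and inspects the reconstruction loss and the random patch mask. It assumes the `facebook/vit-mae-base` checkpoint and a sample COCO image:
+
+```python
+import requests
+import torch
+from PIL import Image
+
+from transformers import AutoImageProcessor, ViTMAEForPreTraining
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+processor = AutoImageProcessor.from_pretrained("facebook/vit-mae-base")
+model = ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base")
+
+inputs = processor(images=image, return_tensors="pt")
+with torch.no_grad():
+    outputs = model(**inputs)
+
+loss = outputs.loss  # pixel reconstruction loss, computed on the masked patches only
+mask = outputs.mask  # 1 for masked patches, 0 for visible ones (roughly 75% are masked)
+logits = outputs.logits  # predicted pixel values for each patch
+```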
+ +## ViTMAEConfig + +[[autodoc]] ViTMAEConfig + + +## ViTMAEModel + +[[autodoc]] ViTMAEModel + - forward + + +## ViTMAEForPreTraining + +[[autodoc]] transformers.ViTMAEForPreTraining + - forward + + +## TFViTMAEModel + +[[autodoc]] TFViTMAEModel + - call + + +## TFViTMAEForPreTraining + +[[autodoc]] transformers.TFViTMAEForPreTraining + - call diff --git a/docs/source/en/model_doc/vit_mae.mdx b/docs/source/en/model_doc/vit_mae.mdx deleted file mode 100644 index 454423707046..000000000000 --- a/docs/source/en/model_doc/vit_mae.mdx +++ /dev/null @@ -1,81 +0,0 @@ - - -# ViTMAE - -## Overview - -The ViTMAE model was proposed in [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377v2) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, -Piotr Dollár, Ross Girshick. The paper shows that, by pre-training a Vision Transformer (ViT) to reconstruct pixel values for masked patches, one can get results after -fine-tuning that outperform supervised pre-training. - -The abstract from the paper is the following: - -*This paper shows that masked autoencoders (MAE) are scalable self-supervised learners for computer vision. Our MAE approach is simple: we mask random patches of the -input image and reconstruct the missing pixels. It is based on two core designs. First, we develop an asymmetric encoder-decoder architecture, with an encoder that operates -only on the visible subset of patches (without mask tokens), along with a lightweight decoder that reconstructs the original image from the latent representation and mask -tokens. Second, we find that masking a high proportion of the input image, e.g., 75%, yields a nontrivial and meaningful self-supervisory task. Coupling these two designs -enables us to train large models efficiently and effectively: we accelerate training (by 3x or more) and improve accuracy. Our scalable approach allows for learning high-capacity -models that generalize well: e.g., a vanilla ViT-Huge model achieves the best accuracy (87.8%) among methods that use only ImageNet-1K data. Transfer performance in downstream -tasks outperforms supervised pre-training and shows promising scaling behavior.* - -Tips: - -- MAE (masked auto encoding) is a method for self-supervised pre-training of Vision Transformers (ViTs). The pre-training objective is relatively simple: -by masking a large portion (75%) of the image patches, the model must reconstruct raw pixel values. One can use [`ViTMAEForPreTraining`] for this purpose. -- An example Python script that illustrates how to pre-train [`ViTMAEForPreTraining`] from scratch can be found [here](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining). -One can easily tweak it for their own use case. -- A notebook that illustrates how to visualize reconstructed pixel values with [`ViTMAEForPreTraining`] can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/ViTMAE/ViT_MAE_visualization_demo.ipynb). -- After pre-training, one "throws away" the decoder used to reconstruct pixels, and one uses the encoder for fine-tuning/linear probing. This means that after -fine-tuning, one can directly plug in the weights into a [`ViTForImageClassification`]. -- One can use [`ViTImageProcessor`] to prepare images for the model. See the code examples for more info. -- Note that the encoder of MAE is only used to encode the visual patches. The encoded patches are then concatenated with mask tokens, which the decoder (which also -consists of Transformer blocks) takes as input. 
Each mask token is a shared, learned vector that indicates the presence of a missing patch to be predicted. Fixed -sin/cos position embeddings are added both to the input of the encoder and the decoder. -- For a visual understanding of how MAEs work you can check out this [post](https://keras.io/examples/vision/masked_image_modeling/). - - - - MAE architecture. Taken from the original paper. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). TensorFlow version of the model was contributed by [sayakpaul](https://github.com/sayakpaul) and -[ariG23498](https://github.com/ariG23498) (equal contribution). The original code can be found [here](https://github.com/facebookresearch/mae). - - -## ViTMAEConfig - -[[autodoc]] ViTMAEConfig - - -## ViTMAEModel - -[[autodoc]] ViTMAEModel - - forward - - -## ViTMAEForPreTraining - -[[autodoc]] transformers.ViTMAEForPreTraining - - forward - - -## TFViTMAEModel - -[[autodoc]] TFViTMAEModel - - call - - -## TFViTMAEForPreTraining - -[[autodoc]] transformers.TFViTMAEForPreTraining - - call diff --git a/docs/source/en/model_doc/vit_msn.md b/docs/source/en/model_doc/vit_msn.md new file mode 100644 index 000000000000..ded0245194f8 --- /dev/null +++ b/docs/source/en/model_doc/vit_msn.md @@ -0,0 +1,78 @@ + + +# ViTMSN + +## Overview + +The ViTMSN model was proposed in [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, +Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. The paper presents a joint-embedding architecture to match the prototypes +of masked patches with that of the unmasked patches. With this setup, their method yields excellent performance in the low-shot and extreme low-shot +regimes. + +The abstract from the paper is the following: + +*We propose Masked Siamese Networks (MSN), a self-supervised learning framework for learning image representations. Our +approach matches the representation of an image view containing randomly masked patches to the representation of the original +unmasked image. This self-supervised pre-training strategy is particularly scalable when applied to Vision Transformers since only the +unmasked patches are processed by the network. As a result, MSNs improve the scalability of joint-embedding architectures, +while producing representations of a high semantic level that perform competitively on low-shot image classification. For instance, +on ImageNet-1K, with only 5,000 annotated images, our base MSN model achieves 72.4% top-1 accuracy, +and with 1% of ImageNet-1K labels, we achieve 75.7% top-1 accuracy, setting a new state-of-the-art for self-supervised learning on this benchmark.* + +Tips: + +- MSN (masked siamese networks) is a method for self-supervised pre-training of Vision Transformers (ViTs). The pre-training +objective is to match the prototypes assigned to the unmasked views of the images to that of the masked views of the same images. +- The authors have only released pre-trained weights of the backbone (ImageNet-1k pre-training). So, to use that on your own image classification dataset, +use the [`ViTMSNForImageClassification`] class which is initialized from [`ViTMSNModel`]. Follow +[this notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_classification.ipynb) for a detailed tutorial on fine-tuning. +- MSN is particularly useful in the low-shot and extreme low-shot regimes. 
Notably, it achieves 75.7% top-1 accuracy with only 1% of ImageNet-1K +labels when fine-tuned. + + +drawing + + MSN architecture. Taken from the original paper. + +This model was contributed by [sayakpaul](https://huggingface.co/sayakpaul). The original code can be found [here](https://github.com/facebookresearch/msn). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT MSN. + + + +- [`ViTMSNForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). +- See also: [Image classification task guide](../tasks/image_classification) + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## ViTMSNConfig + +[[autodoc]] ViTMSNConfig + + +## ViTMSNModel + +[[autodoc]] ViTMSNModel + - forward + + +## ViTMSNForImageClassification + +[[autodoc]] ViTMSNForImageClassification + - forward diff --git a/docs/source/en/model_doc/vit_msn.mdx b/docs/source/en/model_doc/vit_msn.mdx deleted file mode 100644 index 07faed51e6cb..000000000000 --- a/docs/source/en/model_doc/vit_msn.mdx +++ /dev/null @@ -1,64 +0,0 @@ - - -# ViTMSN - -## Overview - -The ViTMSN model was proposed in [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, -Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. The paper presents a joint-embedding architecture to match the prototypes -of masked patches with that of the unmasked patches. With this setup, their method yields excellent performance in the low-shot and extreme low-shot -regimes. - -The abstract from the paper is the following: - -*We propose Masked Siamese Networks (MSN), a self-supervised learning framework for learning image representations. Our -approach matches the representation of an image view containing randomly masked patches to the representation of the original -unmasked image. This self-supervised pre-training strategy is particularly scalable when applied to Vision Transformers since only the -unmasked patches are processed by the network. As a result, MSNs improve the scalability of joint-embedding architectures, -while producing representations of a high semantic level that perform competitively on low-shot image classification. For instance, -on ImageNet-1K, with only 5,000 annotated images, our base MSN model achieves 72.4% top-1 accuracy, -and with 1% of ImageNet-1K labels, we achieve 75.7% top-1 accuracy, setting a new state-of-the-art for self-supervised learning on this benchmark.* - -Tips: - -- MSN (masked siamese networks) is a method for self-supervised pre-training of Vision Transformers (ViTs). The pre-training -objective is to match the prototypes assigned to the unmasked views of the images to that of the masked views of the same images. -- The authors have only released pre-trained weights of the backbone (ImageNet-1k pre-training). So, to use that on your own image classification dataset, -use the [`ViTMSNForImageClassification`] class which is initialized from [`ViTMSNModel`]. 
Follow -[this notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_classification.ipynb) for a detailed tutorial on fine-tuning. -- MSN is particularly useful in the low-shot and extreme low-shot regimes. Notably, it achieves 75.7% top-1 accuracy with only 1% of ImageNet-1K -labels when fine-tuned. - - -drawing - - MSN architecture. Taken from the original paper. - -This model was contributed by [sayakpaul](https://huggingface.co/sayakpaul). The original code can be found [here](https://github.com/facebookresearch/msn). - - -## ViTMSNConfig - -[[autodoc]] ViTMSNConfig - - -## ViTMSNModel - -[[autodoc]] ViTMSNModel - - forward - - -## ViTMSNForImageClassification - -[[autodoc]] ViTMSNForImageClassification - - forward diff --git a/docs/source/en/model_doc/vitdet.md b/docs/source/en/model_doc/vitdet.md new file mode 100644 index 000000000000..657e467ee319 --- /dev/null +++ b/docs/source/en/model_doc/vitdet.md @@ -0,0 +1,39 @@ + + +# ViTDet + +## Overview + +The ViTDet model was proposed in [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He. +VitDet leverages the plain [Vision Transformer](vit) for the task of object detection. + +The abstract from the paper is the following: + +*We explore the plain, non-hierarchical Vision Transformer (ViT) as a backbone network for object detection. This design enables the original ViT architecture to be fine-tuned for object detection without needing to redesign a hierarchical backbone for pre-training. With minimal adaptations for fine-tuning, our plain-backbone detector can achieve competitive results. Surprisingly, we observe: (i) it is sufficient to build a simple feature pyramid from a single-scale feature map (without the common FPN design) and (ii) it is sufficient to use window attention (without shifting) aided with very few cross-window propagation blocks. With plain ViT backbones pre-trained as Masked Autoencoders (MAE), our detector, named ViTDet, can compete with the previous leading methods that were all based on hierarchical backbones, reaching up to 61.3 AP_box on the COCO dataset using only ImageNet-1K pre-training. We hope our study will draw attention to research on plain-backbone detectors.* + +Tips: + +- For the moment, only the backbone is available. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). +The original code can be found [here](https://github.com/facebookresearch/detectron2/tree/main/projects/ViTDet). + + +## VitDetConfig + +[[autodoc]] VitDetConfig + +## VitDetModel + +[[autodoc]] VitDetModel + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/vitmatte.md b/docs/source/en/model_doc/vitmatte.md new file mode 100644 index 000000000000..479b398f8066 --- /dev/null +++ b/docs/source/en/model_doc/vitmatte.md @@ -0,0 +1,55 @@ + + +# ViTMatte + +## Overview + +The ViTMatte model was proposed in [Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. +ViTMatte leverages plain [Vision Transformers](vit) for the task of image matting, which is the process of accurately estimating the foreground object in images and videos. + +The abstract from the paper is the following: + +*Recently, plain vision Transformers (ViTs) have shown impressive performance on various computer vision tasks, thanks to their strong modeling capacity and large-scale pretraining. 
However, they have not yet conquered the problem of image matting. We hypothesize that image matting could also be boosted by ViTs and present a new efficient and robust ViT-based matting system, named ViTMatte. Our method utilizes (i) a hybrid attention mechanism combined with a convolution neck to help ViTs achieve an excellent performance-computation trade-off in matting tasks. (ii) Additionally, we introduce the detail capture module, which just consists of simple lightweight convolutions to complement the detailed information required by matting. To the best of our knowledge, ViTMatte is the first work to unleash the potential of ViT on image matting with concise adaptation. It inherits many superior properties from ViT to matting, including various pretraining strategies, concise architecture design, and flexible inference strategies. We evaluate ViTMatte on Composition-1k and Distinctions-646, the most commonly used benchmark for image matting, our method achieves state-of-the-art performance and outperforms prior matting works by a large margin.* + +Tips: + +- The model expects both the image and trimap (concatenated) as input. One can use [`ViTMatteImageProcessor`] for this purpose. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). +The original code can be found [here](https://github.com/hustvl/ViTMatte). + + + + ViTMatte high-level overview. Taken from the original paper. + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViTMatte. + +- A demo notebook regarding inference with [`VitMatteForImageMatting`], including background replacement, can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/ViTMatte). + + +## VitMatteConfig + +[[autodoc]] VitMatteConfig + +## VitMatteImageProcessor + +[[autodoc]] VitMatteImageProcessor + - preprocess + +## VitMatteForImageMatting + +[[autodoc]] VitMatteForImageMatting + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/vits.md b/docs/source/en/model_doc/vits.md new file mode 100644 index 000000000000..1b57df4027dd --- /dev/null +++ b/docs/source/en/model_doc/vits.md @@ -0,0 +1,162 @@ + + +# VITS + +## Overview + +The VITS model was proposed in [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. + + +VITS (**V**ariational **I**nference with adversarial learning for end-to-end **T**ext-to-**S**peech) is an end-to-end +speech synthesis model that predicts a speech waveform conditional on an input text sequence. It is a conditional variational +autoencoder (VAE) comprised of a posterior encoder, decoder, and conditional prior. + +A set of spectrogram-based acoustic features are predicted by the flow-based module, which is formed of a Transformer-based +text encoder and multiple coupling layers. The spectrogram is decoded using a stack of transposed convolutional layers, +much in the same style as the HiFi-GAN vocoder. Motivated by the one-to-many nature of the TTS problem, where the same text +input can be spoken in multiple ways, the model also includes a stochastic duration predictor, which allows the model to +synthesise speech with different rhythms from the same input text. + +The model is trained end-to-end with a combination of losses derived from variational lower bound and adversarial training. 
+To improve the expressiveness of the model, normalizing flows are applied to the conditional prior distribution. During +inference, the text encodings are up-sampled based on the duration prediction module, and then mapped into the +waveform using a cascade of the flow module and HiFi-GAN decoder. Due to the stochastic nature of the duration predictor, +the model is non-deterministic, and thus requires a fixed seed to generate the same speech waveform. + +The abstract from the paper is the following: + +*Several recent end-to-end text-to-speech (TTS) models enabling single-stage training and parallel sampling have been proposed, but their sample quality does not match that of two-stage TTS systems. In this work, we present a parallel end-to-end TTS method that generates more natural sounding audio than current two-stage models. Our method adopts variational inference augmented with normalizing flows and an adversarial training process, which improves the expressive power of generative modeling. We also propose a stochastic duration predictor to synthesize speech with diverse rhythms from input text. With the uncertainty modeling over latent variables and the stochastic duration predictor, our method expresses the natural one-to-many relationship in which a text input can be spoken in multiple ways with different pitches and rhythms. A subjective human evaluation (mean opinion score, or MOS) on the LJ Speech, a single speaker dataset, shows that our method outperforms the best publicly available TTS systems and achieves a MOS comparable to ground truth.* + +This model can also be used with TTS checkpoints from [Massively Multilingual Speech (MMS)](https://arxiv.org/abs/2305.13516) +as these checkpoints use the same architecture and a slightly modified tokenizer. + +This model was contributed by [Matthijs](https://huggingface.co/Matthijs) and [sanchit-gandhi](https://huggingface.co/sanchit-gandhi). The original code can be found [here](https://github.com/jaywalnut310/vits). + +## Model Usage + +Both the VITS and MMS-TTS checkpoints can be used with the same API. Since the flow-based model is non-deterministic, it +is good practice to set a seed to ensure reproducibility of the outputs. For languages with a Roman alphabet, +such as English or French, the tokenizer can be used directly to pre-process the text inputs. The following code example +runs a forward pass using the MMS-TTS English checkpoint: + +```python +import torch +from transformers import VitsTokenizer, VitsModel, set_seed + +tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng") +model = VitsModel.from_pretrained("facebook/mms-tts-eng") + +inputs = tokenizer(text="Hello - my dog is cute", return_tensors="pt") + +set_seed(555) # make deterministic + +with torch.no_grad(): + outputs = model(**inputs) + +waveform = outputs.waveform[0] +``` + +The resulting waveform can be saved as a `.wav` file: + +```python +import scipy + +scipy.io.wavfile.write("techno.wav", rate=model.config.sampling_rate, data=waveform) +``` + +Or displayed in a Jupyter Notebook / Google Colab: + +```python +from IPython.display import Audio + +Audio(waveform, rate=model.config.sampling_rate) +``` + +For certain languages with a non-Roman alphabet, such as Arabic, Mandarin or Hindi, the [`uroman`](https://github.com/isi-nlp/uroman) +perl package is required to pre-process the text inputs to the Roman alphabet. 
+ +You can check whether you require the `uroman` package for your language by inspecting the `is_uroman` attribute of +the pre-trained `tokenizer`: + +```python +from transformers import VitsTokenizer + +tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng") +print(tokenizer.is_uroman) +``` + +If required, you should apply the uroman package to your text inputs **prior** to passing them to the `VitsTokenizer`, +since currently the tokenizer does not support performing the pre-processing itself. + +To do this, first clone the uroman repository to your local machine and set the bash variable `UROMAN` to the local path: + +```bash +git clone https://github.com/isi-nlp/uroman.git +cd uroman +export UROMAN=$(pwd) +``` + +You can then pre-process the text input using the following code snippet. You can either rely on using the bash variable +`UROMAN` to point to the uroman repository, or you can pass the uroman directory as an argument to the `uromaize` function: + +```python +import torch +from transformers import VitsTokenizer, VitsModel, set_seed +import os +import subprocess + +tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-kor") +model = VitsModel.from_pretrained("facebook/mms-tts-kor") + +def uromanize(input_string, uroman_path): + """Convert non-Roman strings to Roman using the `uroman` perl package.""" + script_path = os.path.join(uroman_path, "bin", "uroman.pl") + + command = ["perl", script_path] + + process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + # Execute the perl command + stdout, stderr = process.communicate(input=input_string.encode()) + + if process.returncode != 0: + raise ValueError(f"Error {process.returncode}: {stderr.decode()}") + + # Return the output as a string and skip the new-line character at the end + return stdout.decode()[:-1] + +text = "이봐 무슨 일이야" +uromaized_text = uromanize(text, uroman_path=os.environ["UROMAN"]) + +inputs = tokenizer(text=uromaized_text, return_tensors="pt") + +set_seed(555) # make deterministic +with torch.no_grad(): + outputs = model(inputs["input_ids"]) + +waveform = outputs.waveform[0] +``` + +## VitsConfig + +[[autodoc]] VitsConfig + +## VitsTokenizer + +[[autodoc]] VitsTokenizer + - __call__ + - save_vocabulary + +## VitsModel + +[[autodoc]] VitsModel + - forward diff --git a/docs/source/en/model_doc/vivit.md b/docs/source/en/model_doc/vivit.md new file mode 100644 index 000000000000..755629a76752 --- /dev/null +++ b/docs/source/en/model_doc/vivit.md @@ -0,0 +1,44 @@ + + +# Video Vision Transformer (ViViT) + +## Overview + +The Vivit model was proposed in [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. +The paper proposes one of the first successful pure-transformer based set of models for video understanding. + +The abstract from the paper is the following: + +*We present pure-transformer based models for video classification, drawing upon the recent success of such models in image classification. Our model extracts spatio-temporal tokens from the input video, which are then encoded by a series of transformer layers. In order to handle the long sequences of tokens encountered in video, we propose several, efficient variants of our model which factorise the spatial- and temporal-dimensions of the input. 
Although transformer-based models are known to only be effective when large training datasets are available, we show how we can effectively regularise the model during training and leverage pretrained image models to be able to train on comparatively small datasets. We conduct thorough ablation studies, and achieve state-of-the-art results on multiple video classification benchmarks including Kinetics 400 and 600, Epic Kitchens, Something-Something v2 and Moments in Time, outperforming prior methods based on deep 3D convolutional networks.* + + +This model was contributed by [jegormeister](https://huggingface.co/jegormeister). The original code (written in JAX) can be found [here](https://github.com/google-research/scenic/tree/main/scenic/projects/vivit). + +## VivitConfig + +[[autodoc]] VivitConfig + +## VivitImageProcessor + +[[autodoc]] VivitImageProcessor + - preprocess + +## VivitModel + +[[autodoc]] VivitModel + - forward + +## VivitForVideoClassification + +[[autodoc]] transformers.VivitForVideoClassification + - forward diff --git a/docs/source/en/model_doc/wav2vec2-conformer.md b/docs/source/en/model_doc/wav2vec2-conformer.md new file mode 100644 index 000000000000..87e255cd0c6e --- /dev/null +++ b/docs/source/en/model_doc/wav2vec2-conformer.md @@ -0,0 +1,81 @@ + + +# Wav2Vec2-Conformer + +## Overview + +The Wav2Vec2-Conformer was added to an updated version of [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. + +The official results of the model can be found in Table 3 and Table 4 of the paper. + +The Wav2Vec2-Conformer weights were released by the Meta AI team within the [Fairseq library](https://github.com/pytorch/fairseq/blob/main/examples/wav2vec/README.md#pre-trained-models). + +Tips: + +- Wav2Vec2-Conformer follows the same architecture as Wav2Vec2, but replaces the *Attention*-block with a *Conformer*-block + as introduced in [Conformer: Convolution-augmented Transformer for Speech Recognition](https://arxiv.org/abs/2005.08100). +- For the same number of layers, Wav2Vec2-Conformer requires more parameters than Wav2Vec2, but also yields +an improved word error rate. +- Wav2Vec2-Conformer uses the same tokenizer and feature extractor as Wav2Vec2. +- Wav2Vec2-Conformer can use either no relative position embeddings, Transformer-XL-like position embeddings, or + rotary position embeddings by setting the correct `config.position_embeddings_type`. + +This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). +The original code can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec). 
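+
+As a minimal sketch of the tips above, a Wav2Vec2-Conformer CTC checkpoint can be used exactly like a Wav2Vec2 one.
+The checkpoint name below is an assumption; any Wav2Vec2-Conformer CTC checkpoint from the Hub follows the same pattern:
+
+```python
+import torch
+from datasets import load_dataset
+from transformers import AutoProcessor, Wav2Vec2ConformerForCTC
+
+# assumed checkpoint; the processor is the same tokenizer/feature extractor pair as Wav2Vec2
+checkpoint = "facebook/wav2vec2-conformer-rope-large-960h-ft"
+processor = AutoProcessor.from_pretrained(checkpoint)
+model = Wav2Vec2ConformerForCTC.from_pretrained(checkpoint)
+
+# load a 16 kHz example utterance
+dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+audio = dataset[0]["audio"]["array"]
+
+inputs = processor(audio, sampling_rate=16_000, return_tensors="pt")
+with torch.no_grad():
+    logits = model(**inputs).logits
+
+# greedy CTC decoding back to text
+predicted_ids = torch.argmax(logits, dim=-1)
+transcription = processor.batch_decode(predicted_ids)[0]
+```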
+ +## Documentation resources + +- [Audio classification task guide](../tasks/audio_classification) +- [Automatic speech recognition task guide](../tasks/asr) + +## Wav2Vec2ConformerConfig + +[[autodoc]] Wav2Vec2ConformerConfig + +## Wav2Vec2Conformer specific outputs + +[[autodoc]] models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerForPreTrainingOutput + +## Wav2Vec2ConformerModel + +[[autodoc]] Wav2Vec2ConformerModel + - forward + +## Wav2Vec2ConformerForCTC + +[[autodoc]] Wav2Vec2ConformerForCTC + - forward + +## Wav2Vec2ConformerForSequenceClassification + +[[autodoc]] Wav2Vec2ConformerForSequenceClassification + - forward + +## Wav2Vec2ConformerForAudioFrameClassification + +[[autodoc]] Wav2Vec2ConformerForAudioFrameClassification + - forward + +## Wav2Vec2ConformerForXVector + +[[autodoc]] Wav2Vec2ConformerForXVector + - forward + +## Wav2Vec2ConformerForPreTraining + +[[autodoc]] Wav2Vec2ConformerForPreTraining + - forward diff --git a/docs/source/en/model_doc/wav2vec2-conformer.mdx b/docs/source/en/model_doc/wav2vec2-conformer.mdx deleted file mode 100644 index 2cfb38553f1e..000000000000 --- a/docs/source/en/model_doc/wav2vec2-conformer.mdx +++ /dev/null @@ -1,73 +0,0 @@ - - -# Wav2Vec2-Conformer - -## Overview - -The Wav2Vec2-Conformer was added to an updated version of [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. - -The official results of the model can be found in Table 3 and Table 4 of the paper. - -The Wav2Vec2-Conformer weights were released by the Meta AI team within the [Fairseq library](https://github.com/pytorch/fairseq/blob/main/examples/wav2vec/README.md#pre-trained-models). - -Tips: - -- Wav2Vec2-Conformer follows the same architecture as Wav2Vec2, but replaces the *Attention*-block with a *Conformer*-block - as introduced in [Conformer: Convolution-augmented Transformer for Speech Recognition](https://arxiv.org/abs/2005.08100). -- For the same number of layers, Wav2Vec2-Conformer requires more parameters than Wav2Vec2, but also yields -an improved word error rate. -- Wav2Vec2-Conformer uses the same tokenizer and feature extractor as Wav2Vec2. -- Wav2Vec2-Conformer can use either no relative position embeddings, Transformer-XL-like position embeddings, or - rotary position embeddings by setting the correct `config.position_embeddings_type`. - -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). -The original code can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec). 
- - -## Wav2Vec2ConformerConfig - -[[autodoc]] Wav2Vec2ConformerConfig - -## Wav2Vec2Conformer specific outputs - -[[autodoc]] models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerForPreTrainingOutput - -## Wav2Vec2ConformerModel - -[[autodoc]] Wav2Vec2ConformerModel - - forward - -## Wav2Vec2ConformerForCTC - -[[autodoc]] Wav2Vec2ConformerForCTC - - forward - -## Wav2Vec2ConformerForSequenceClassification - -[[autodoc]] Wav2Vec2ConformerForSequenceClassification - - forward - -## Wav2Vec2ConformerForAudioFrameClassification - -[[autodoc]] Wav2Vec2ConformerForAudioFrameClassification - - forward - -## Wav2Vec2ConformerForXVector - -[[autodoc]] Wav2Vec2ConformerForXVector - - forward - -## Wav2Vec2ConformerForPreTraining - -[[autodoc]] Wav2Vec2ConformerForPreTraining - - forward diff --git a/docs/source/en/model_doc/wav2vec2.md b/docs/source/en/model_doc/wav2vec2.md new file mode 100644 index 000000000000..3a67f66d9d1f --- /dev/null +++ b/docs/source/en/model_doc/wav2vec2.md @@ -0,0 +1,229 @@ + + +# Wav2Vec2 + +## Overview + +The Wav2Vec2 model was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. + +The abstract from the paper is the following: + +*We show for the first time that learning powerful representations from speech audio alone followed by fine-tuning on +transcribed speech can outperform the best semi-supervised methods while being conceptually simpler. wav2vec 2.0 masks +the speech input in the latent space and solves a contrastive task defined over a quantization of the latent +representations which are jointly learned. Experiments using all labeled data of Librispeech achieve 1.8/3.3 WER on the +clean/other test sets. When lowering the amount of labeled data to one hour, wav2vec 2.0 outperforms the previous state +of the art on the 100 hour subset while using 100 times less labeled data. Using just ten minutes of labeled data and +pre-training on 53k hours of unlabeled data still achieves 4.8/8.2 WER. This demonstrates the feasibility of speech +recognition with limited amounts of labeled data.* + +Tips: + +- Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. +- Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be decoded + using [`Wav2Vec2CTCTokenizer`]. + +This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Wav2Vec2. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + + + +- A notebook on how to [leverage a pretrained Wav2Vec2 model for emotion classification](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb). 🌎 +- [`Wav2Vec2ForCTC`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/audio-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb). 
+- [Audio classification task guide](../tasks/audio_classification) + + + +- A blog post on [boosting Wav2Vec2 with n-grams in 🤗 Transformers](https://huggingface.co/blog/wav2vec2-with-ngram). +- A blog post on how to [finetune Wav2Vec2 for English ASR with 🤗 Transformers](https://huggingface.co/blog/fine-tune-wav2vec2-english). +- A blog post on [finetuning XLS-R for Multi-Lingual ASR with 🤗 Transformers](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2). +- A notebook on how to [create YouTube captions from any video by transcribing audio with Wav2Vec2](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb). 🌎 +- [`Wav2Vec2ForCTC`] is supported by a notebook on [how to finetune a speech recognition model in English](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/speech_recognition.ipynb), and [how to finetune a speech recognition model in any language](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multi_lingual_speech_recognition.ipynb). +- [Automatic speech recognition task guide](../tasks/asr) + +🚀 Deploy + +- A blog post on how to deploy Wav2Vec2 for [Automatic Speech Recogntion with Hugging Face's Transformers & Amazon SageMaker](https://www.philschmid.de/automatic-speech-recognition-sagemaker). + +## Wav2Vec2Config + +[[autodoc]] Wav2Vec2Config + +## Wav2Vec2CTCTokenizer + +[[autodoc]] Wav2Vec2CTCTokenizer + - __call__ + - save_vocabulary + - decode + - batch_decode + - set_target_lang + +## Wav2Vec2FeatureExtractor + +[[autodoc]] Wav2Vec2FeatureExtractor + - __call__ + +## Wav2Vec2Processor + +[[autodoc]] Wav2Vec2Processor + - __call__ + - pad + - from_pretrained + - save_pretrained + - batch_decode + - decode + +## Wav2Vec2ProcessorWithLM + +[[autodoc]] Wav2Vec2ProcessorWithLM + - __call__ + - pad + - from_pretrained + - save_pretrained + - batch_decode + - decode + +### Decoding multiple audios + +If you are planning to decode multiple batches of audios, you should consider using [`~Wav2Vec2ProcessorWithLM.batch_decode`] and passing an instantiated `multiprocessing.Pool`. +Otherwise, [`~Wav2Vec2ProcessorWithLM.batch_decode`] performance will be slower than calling [`~Wav2Vec2ProcessorWithLM.decode`] for each audio individually, as it internally instantiates a new `Pool` for every call. See the example below: + +```python +>>> # Let's see how to use a user-managed pool for batch decoding multiple audios +>>> from multiprocessing import get_context +>>> from transformers import AutoTokenizer, AutoProcessor, AutoModelForCTC +>>> from datasets import load_dataset +>>> import datasets +>>> import torch + +>>> # import model, feature extractor, tokenizer +>>> model = AutoModelForCTC.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm").to("cuda") +>>> processor = AutoProcessor.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm") + +>>> # load example dataset +>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") +>>> dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000)) + + +>>> def map_to_array(batch): +... batch["speech"] = batch["audio"]["array"] +... return batch + + +>>> # prepare speech data for batch inference +>>> dataset = dataset.map(map_to_array, remove_columns=["audio"]) + + +>>> def map_to_pred(batch, pool): +... inputs = processor(batch["speech"], sampling_rate=16_000, padding=True, return_tensors="pt") +... 
inputs = {k: v.to("cuda") for k, v in inputs.items()} + +... with torch.no_grad(): +... logits = model(**inputs).logits + +... transcription = processor.batch_decode(logits.cpu().numpy(), pool).text +... batch["transcription"] = transcription +... return batch + + +>>> # note: pool should be instantiated *after* `Wav2Vec2ProcessorWithLM`. +>>> # otherwise, the LM won't be available to the pool's sub-processes +>>> # select number of processes and batch_size based on number of CPU cores available and on dataset size +>>> with get_context("fork").Pool(processes=2) as pool: +... result = dataset.map( +... map_to_pred, batched=True, batch_size=2, fn_kwargs={"pool": pool}, remove_columns=["speech"] +... ) + +>>> result["transcription"][:2] +['MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL', "NOR IS MISTER COULTER'S MANNER LESS INTERESTING THAN HIS MATTER"] +``` + +## Wav2Vec2 specific outputs + +[[autodoc]] models.wav2vec2_with_lm.processing_wav2vec2_with_lm.Wav2Vec2DecoderWithLMOutput + +[[autodoc]] models.wav2vec2.modeling_wav2vec2.Wav2Vec2BaseModelOutput + +[[autodoc]] models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTrainingOutput + +[[autodoc]] models.wav2vec2.modeling_flax_wav2vec2.FlaxWav2Vec2BaseModelOutput + +[[autodoc]] models.wav2vec2.modeling_flax_wav2vec2.FlaxWav2Vec2ForPreTrainingOutput + +## Wav2Vec2Model + +[[autodoc]] Wav2Vec2Model + - forward + +## Wav2Vec2ForCTC + +[[autodoc]] Wav2Vec2ForCTC + - forward + - load_adapter + +## Wav2Vec2ForSequenceClassification + +[[autodoc]] Wav2Vec2ForSequenceClassification + - forward + +## Wav2Vec2ForAudioFrameClassification + +[[autodoc]] Wav2Vec2ForAudioFrameClassification + - forward + +## Wav2Vec2ForXVector + +[[autodoc]] Wav2Vec2ForXVector + - forward + +## Wav2Vec2ForPreTraining + +[[autodoc]] Wav2Vec2ForPreTraining + - forward + +## TFWav2Vec2Model + +[[autodoc]] TFWav2Vec2Model + - call + +## TFWav2Vec2ForSequenceClassification + +[[autodoc]] TFWav2Vec2ForSequenceClassification + - call + +## TFWav2Vec2ForCTC + +[[autodoc]] TFWav2Vec2ForCTC + - call + +## FlaxWav2Vec2Model + +[[autodoc]] FlaxWav2Vec2Model + - __call__ + +## FlaxWav2Vec2ForCTC + +[[autodoc]] FlaxWav2Vec2ForCTC + - __call__ + +## FlaxWav2Vec2ForPreTraining + +[[autodoc]] FlaxWav2Vec2ForPreTraining + - __call__ diff --git a/docs/source/en/model_doc/wav2vec2.mdx b/docs/source/en/model_doc/wav2vec2.mdx deleted file mode 100644 index 3acf176a27a8..000000000000 --- a/docs/source/en/model_doc/wav2vec2.mdx +++ /dev/null @@ -1,216 +0,0 @@ - - -# Wav2Vec2 - -## Overview - -The Wav2Vec2 model was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. - -The abstract from the paper is the following: - -*We show for the first time that learning powerful representations from speech audio alone followed by fine-tuning on -transcribed speech can outperform the best semi-supervised methods while being conceptually simpler. wav2vec 2.0 masks -the speech input in the latent space and solves a contrastive task defined over a quantization of the latent -representations which are jointly learned. Experiments using all labeled data of Librispeech achieve 1.8/3.3 WER on the -clean/other test sets. When lowering the amount of labeled data to one hour, wav2vec 2.0 outperforms the previous state -of the art on the 100 hour subset while using 100 times less labeled data. 
Using just ten minutes of labeled data and -pre-training on 53k hours of unlabeled data still achieves 4.8/8.2 WER. This demonstrates the feasibility of speech -recognition with limited amounts of labeled data.* - -Tips: - -- Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. -- Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be decoded - using [`Wav2Vec2CTCTokenizer`]. - -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). - -## Resources - -A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Wav2Vec2. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. - - - -- A notebook on how to [leverage a pretrained Wav2Vec2 model for emotion classification](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb). 🌎 -- [`Wav2Vec2ForCTC`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/audio-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb). - - - -- A blog post on [boosting Wav2Vec2 with n-grams in 🤗 Transformers](https://huggingface.co/blog/wav2vec2-with-ngram). -- A blog post on how to [finetune Wav2Vec2 for English ASR with 🤗 Transformers](https://huggingface.co/blog/fine-tune-wav2vec2-english). -- A blog post on [finetuning XLS-R for Multi-Lingual ASR with 🤗 Transformers](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2). -- A notebook on how to [create YouTube captions from any video by transcribing audio with Wav2Vec2](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb). 🌎 -- [`Wav2Vec2ForCTC`] is supported by a notebook on [how to finetune a speech recognition model in English](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/speech_recognition.ipynb), and [how to finetune a speech recognition model in any language](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multi_lingual_speech_recognition.ipynb). - -🚀 Deploy - -- A blog post on how to deploy Wav2Vec2 for [Automatic Speech Recogntion with Hugging Face's Transformers & Amazon SageMaker](https://www.philschmid.de/automatic-speech-recognition-sagemaker). - -## Wav2Vec2Config - -[[autodoc]] Wav2Vec2Config - -## Wav2Vec2CTCTokenizer - -[[autodoc]] Wav2Vec2CTCTokenizer - - __call__ - - save_vocabulary - - decode - - batch_decode - -## Wav2Vec2FeatureExtractor - -[[autodoc]] Wav2Vec2FeatureExtractor - - __call__ - -## Wav2Vec2Processor - -[[autodoc]] Wav2Vec2Processor - - __call__ - - pad - - from_pretrained - - save_pretrained - - batch_decode - - decode - -## Wav2Vec2ProcessorWithLM - -[[autodoc]] Wav2Vec2ProcessorWithLM - - __call__ - - pad - - from_pretrained - - save_pretrained - - batch_decode - - decode - -### Decoding multiple audios - -If you are planning to decode multiple batches of audios, you should consider using [`~Wav2Vec2ProcessorWithLM.batch_decode`] and passing an instantiated `multiprocessing.Pool`. 
-Otherwise, [`~Wav2Vec2ProcessorWithLM.batch_decode`] performance will be slower than calling [`~Wav2Vec2ProcessorWithLM.decode`] for each audio individually, as it internally instantiates a new `Pool` for every call. See the example below: - -```python ->>> # Let's see how to use a user-managed pool for batch decoding multiple audios ->>> from multiprocessing import get_context ->>> from transformers import AutoTokenizer, AutoProcessor, AutoModelForCTC ->>> from datasets import load_dataset ->>> import datasets ->>> import torch - ->>> # import model, feature extractor, tokenizer ->>> model = AutoModelForCTC.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm").to("cuda") ->>> processor = AutoProcessor.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm") - ->>> # load example dataset ->>> dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ->>> dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000)) - - ->>> def map_to_array(batch): -... batch["speech"] = batch["audio"]["array"] -... return batch - - ->>> # prepare speech data for batch inference ->>> dataset = dataset.map(map_to_array, remove_columns=["audio"]) - - ->>> def map_to_pred(batch, pool): -... inputs = processor(batch["speech"], sampling_rate=16_000, padding=True, return_tensors="pt") -... inputs = {k: v.to("cuda") for k, v in inputs.items()} - -... with torch.no_grad(): -... logits = model(**inputs).logits - -... transcription = processor.batch_decode(logits.cpu().numpy(), pool).text -... batch["transcription"] = transcription -... return batch - - ->>> # note: pool should be instantiated *after* `Wav2Vec2ProcessorWithLM`. ->>> # otherwise, the LM won't be available to the pool's sub-processes ->>> # select number of processes and batch_size based on number of CPU cores available and on dataset size ->>> with get_context("fork").Pool(processes=2) as pool: -... result = dataset.map( -... map_to_pred, batched=True, batch_size=2, fn_kwargs={"pool": pool}, remove_columns=["speech"] -... 
) - ->>> result["transcription"][:2] -['MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL', "NOR IS MISTER COULTER'S MANNER LESS INTERESTING THAN HIS MATTER"] -``` - -## Wav2Vec2 specific outputs - -[[autodoc]] models.wav2vec2_with_lm.processing_wav2vec2_with_lm.Wav2Vec2DecoderWithLMOutput - -[[autodoc]] models.wav2vec2.modeling_wav2vec2.Wav2Vec2BaseModelOutput - -[[autodoc]] models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTrainingOutput - -[[autodoc]] models.wav2vec2.modeling_flax_wav2vec2.FlaxWav2Vec2BaseModelOutput - -[[autodoc]] models.wav2vec2.modeling_flax_wav2vec2.FlaxWav2Vec2ForPreTrainingOutput - -## Wav2Vec2Model - -[[autodoc]] Wav2Vec2Model - - forward - -## Wav2Vec2ForCTC - -[[autodoc]] Wav2Vec2ForCTC - - forward - -## Wav2Vec2ForSequenceClassification - -[[autodoc]] Wav2Vec2ForSequenceClassification - - forward - -## Wav2Vec2ForAudioFrameClassification - -[[autodoc]] Wav2Vec2ForAudioFrameClassification - - forward - -## Wav2Vec2ForXVector - -[[autodoc]] Wav2Vec2ForXVector - - forward - -## Wav2Vec2ForPreTraining - -[[autodoc]] Wav2Vec2ForPreTraining - - forward - -## TFWav2Vec2Model - -[[autodoc]] TFWav2Vec2Model - - call - -## TFWav2Vec2ForCTC - -[[autodoc]] TFWav2Vec2ForCTC - - call - -## FlaxWav2Vec2Model - -[[autodoc]] FlaxWav2Vec2Model - - __call__ - -## FlaxWav2Vec2ForCTC - -[[autodoc]] FlaxWav2Vec2ForCTC - - __call__ - -## FlaxWav2Vec2ForPreTraining - -[[autodoc]] FlaxWav2Vec2ForPreTraining - - __call__ diff --git a/docs/source/en/model_doc/wav2vec2_phoneme.md b/docs/source/en/model_doc/wav2vec2_phoneme.md new file mode 100644 index 000000000000..a852bef637b2 --- /dev/null +++ b/docs/source/en/model_doc/wav2vec2_phoneme.md @@ -0,0 +1,60 @@ + + +# Wav2Vec2Phoneme + +## Overview + +The Wav2Vec2Phoneme model was proposed in [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition (Xu et al., +2021](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. + +The abstract from the paper is the following: + +*Recent progress in self-training, self-supervised pretraining and unsupervised learning enabled well performing speech +recognition systems without any labeled data. However, in many cases there is labeled data available for related +languages which is not utilized by these methods. This paper extends previous work on zero-shot cross-lingual transfer +learning by fine-tuning a multilingually pretrained wav2vec 2.0 model to transcribe unseen languages. This is done by +mapping phonemes of the training languages to the target language using articulatory features. Experiments show that +this simple method significantly outperforms prior work which introduced task-specific architectures and used only part +of a monolingually pretrained model.* + +Tips: + +- Wav2Vec2Phoneme uses the exact same architecture as Wav2Vec2 +- Wav2Vec2Phoneme is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. +- Wav2Vec2Phoneme model was trained using connectionist temporal classification (CTC) so the model output has to be + decoded using [`Wav2Vec2PhonemeCTCTokenizer`]. +- Wav2Vec2Phoneme can be fine-tuned on multiple language at once and decode unseen languages in a single forward pass + to a sequence of phonemes +- By default the model outputs a sequence of phonemes. In order to transform the phonemes to a sequence of words one + should make use of a dictionary and language model. 
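+
+A minimal sketch of this recognition loop is shown below. The checkpoint name is an assumption (any phoneme-recognition
+checkpoint from the Hub follows the same pattern), and the decoded output is a sequence of phonemes rather than words:
+
+```python
+import torch
+from datasets import load_dataset
+from transformers import AutoProcessor, Wav2Vec2ForCTC
+
+# assumed checkpoint name; the associated tokenizer is a Wav2Vec2PhonemeCTCTokenizer
+checkpoint = "facebook/wav2vec2-lv-60-espeak-cv-ft"
+processor = AutoProcessor.from_pretrained(checkpoint)
+model = Wav2Vec2ForCTC.from_pretrained(checkpoint)
+
+# load a 16 kHz example utterance
+dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+audio = dataset[0]["audio"]["array"]
+
+inputs = processor(audio, sampling_rate=16_000, return_tensors="pt")
+with torch.no_grad():
+    logits = model(**inputs).logits
+
+# greedy CTC decoding yields a string of phoneme symbols rather than words
+predicted_ids = torch.argmax(logits, dim=-1)
+phonemes = processor.batch_decode(predicted_ids)[0]
+```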
+ +Relevant checkpoints can be found under https://huggingface.co/models?other=phoneme-recognition. + +This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten) + +The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec). + +Wav2Vec2Phoneme's architecture is based on the Wav2Vec2 model, so one can refer to [`Wav2Vec2`]'s documentation page except for the tokenizer. + + +## Wav2Vec2PhonemeCTCTokenizer + +[[autodoc]] Wav2Vec2PhonemeCTCTokenizer + - __call__ + - batch_decode + - decode + - phonemize diff --git a/docs/source/en/model_doc/wav2vec2_phoneme.mdx b/docs/source/en/model_doc/wav2vec2_phoneme.mdx deleted file mode 100644 index b39cf66ce136..000000000000 --- a/docs/source/en/model_doc/wav2vec2_phoneme.mdx +++ /dev/null @@ -1,56 +0,0 @@ - - -# Wav2Vec2Phoneme - -## Overview - -The Wav2Vec2Phoneme model was proposed in [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition (Xu et al., -2021](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. - -The abstract from the paper is the following: - -*Recent progress in self-training, self-supervised pretraining and unsupervised learning enabled well performing speech -recognition systems without any labeled data. However, in many cases there is labeled data available for related -languages which is not utilized by these methods. This paper extends previous work on zero-shot cross-lingual transfer -learning by fine-tuning a multilingually pretrained wav2vec 2.0 model to transcribe unseen languages. This is done by -mapping phonemes of the training languages to the target language using articulatory features. Experiments show that -this simple method significantly outperforms prior work which introduced task-specific architectures and used only part -of a monolingually pretrained model.* - -Tips: - -- Wav2Vec2Phoneme uses the exact same architecture as Wav2Vec2 -- Wav2Vec2Phoneme is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. -- Wav2Vec2Phoneme model was trained using connectionist temporal classification (CTC) so the model output has to be - decoded using [`Wav2Vec2PhonemeCTCTokenizer`]. -- Wav2Vec2Phoneme can be fine-tuned on multiple language at once and decode unseen languages in a single forward pass - to a sequence of phonemes -- By default the model outputs a sequence of phonemes. In order to transform the phonemes to a sequence of words one - should make use of a dictionary and language model. - -Relevant checkpoints can be found under https://huggingface.co/models?other=phoneme-recognition. - -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten) - -The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec). - -Wav2Vec2Phoneme's architecture is based on the Wav2Vec2 model, so one can refer to [`Wav2Vec2`]'s documentation page except for the tokenizer. 
- - -## Wav2Vec2PhonemeCTCTokenizer - -[[autodoc]] Wav2Vec2PhonemeCTCTokenizer - - __call__ - - batch_decode - - decode - - phonemize diff --git a/docs/source/en/model_doc/wavlm.md b/docs/source/en/model_doc/wavlm.md new file mode 100644 index 000000000000..2754304d8264 --- /dev/null +++ b/docs/source/en/model_doc/wavlm.md @@ -0,0 +1,83 @@ + + +# WavLM + +## Overview + +The WavLM model was proposed in [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, +Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, +Michael Zeng, Furu Wei. + +The abstract from the paper is the following: + +*Self-supervised learning (SSL) achieves great success in speech recognition, while limited exploration has been +attempted for other speech processing tasks. As speech signal contains multi-faceted information including speaker +identity, paralinguistics, spoken content, etc., learning universal representations for all speech tasks is +challenging. In this paper, we propose a new pre-trained model, WavLM, to solve full-stack downstream speech tasks. +WavLM is built based on the HuBERT framework, with an emphasis on both spoken content modeling and speaker identity +preservation. We first equip the Transformer structure with gated relative position bias to improve its capability on +recognition tasks. For better speaker discrimination, we propose an utterance mixing training strategy, where +additional overlapped utterances are created unsupervisely and incorporated during model training. Lastly, we scale up +the training dataset from 60k hours to 94k hours. WavLM Large achieves state-of-the-art performance on the SUPERB +benchmark, and brings significant improvements for various speech processing tasks on their representative benchmarks.* + +Tips: + +- WavLM is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. Please use + [`Wav2Vec2Processor`] for the feature extraction. +- WavLM model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded + using [`Wav2Vec2CTCTokenizer`]. +- WavLM performs especially well on speaker verification, speaker identification, and speaker diarization tasks. + +Relevant checkpoints can be found under https://huggingface.co/models?other=wavlm. + +This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be +found [here](https://github.com/microsoft/unilm/tree/master/wavlm). 
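+
+As a minimal sketch (assuming the `microsoft/wavlm-base-plus` checkpoint), the Wav2Vec2 feature extractor prepares the
+raw waveform and [`WavLMModel`] returns frame-level speech representations that the task-specific heads build on:
+
+```python
+import torch
+from datasets import load_dataset
+from transformers import AutoFeatureExtractor, WavLMModel
+
+feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/wavlm-base-plus")
+model = WavLMModel.from_pretrained("microsoft/wavlm-base-plus")
+
+# load a 16 kHz example utterance
+dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+audio = dataset[0]["audio"]["array"]
+
+inputs = feature_extractor(audio, sampling_rate=16_000, return_tensors="pt")
+with torch.no_grad():
+    hidden_states = model(**inputs).last_hidden_state  # shape: (batch, frames, hidden_size)
+```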
+ +## Documentation resources + +- [Audio classification task guide](../tasks/audio_classification) +- [Automatic speech recognition task guide](../tasks/asr) + +## WavLMConfig + +[[autodoc]] WavLMConfig + +## WavLMModel + +[[autodoc]] WavLMModel + - forward + +## WavLMForCTC + +[[autodoc]] WavLMForCTC + - forward + +## WavLMForSequenceClassification + +[[autodoc]] WavLMForSequenceClassification + - forward + +## WavLMForAudioFrameClassification + +[[autodoc]] WavLMForAudioFrameClassification + - forward + +## WavLMForXVector + +[[autodoc]] WavLMForXVector + - forward diff --git a/docs/source/en/model_doc/wavlm.mdx b/docs/source/en/model_doc/wavlm.mdx deleted file mode 100644 index 8e2138a61187..000000000000 --- a/docs/source/en/model_doc/wavlm.mdx +++ /dev/null @@ -1,75 +0,0 @@ - - -# WavLM - -## Overview - -The WavLM model was proposed in [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, -Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, -Michael Zeng, Furu Wei. - -The abstract from the paper is the following: - -*Self-supervised learning (SSL) achieves great success in speech recognition, while limited exploration has been -attempted for other speech processing tasks. As speech signal contains multi-faceted information including speaker -identity, paralinguistics, spoken content, etc., learning universal representations for all speech tasks is -challenging. In this paper, we propose a new pre-trained model, WavLM, to solve full-stack downstream speech tasks. -WavLM is built based on the HuBERT framework, with an emphasis on both spoken content modeling and speaker identity -preservation. We first equip the Transformer structure with gated relative position bias to improve its capability on -recognition tasks. For better speaker discrimination, we propose an utterance mixing training strategy, where -additional overlapped utterances are created unsupervisely and incorporated during model training. Lastly, we scale up -the training dataset from 60k hours to 94k hours. WavLM Large achieves state-of-the-art performance on the SUPERB -benchmark, and brings significant improvements for various speech processing tasks on their representative benchmarks.* - -Tips: - -- WavLM is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. Please use - [`Wav2Vec2Processor`] for the feature extraction. -- WavLM model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded - using [`Wav2Vec2CTCTokenizer`]. -- WavLM performs especially well on speaker verification, speaker identification, and speaker diarization tasks. - -Relevant checkpoints can be found under https://huggingface.co/models?other=wavlm. - -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be -found [here](https://github.com/microsoft/unilm/tree/master/wavlm). 
- - -## WavLMConfig - -[[autodoc]] WavLMConfig - -## WavLMModel - -[[autodoc]] WavLMModel - - forward - -## WavLMForCTC - -[[autodoc]] WavLMForCTC - - forward - -## WavLMForSequenceClassification - -[[autodoc]] WavLMForSequenceClassification - - forward - -## WavLMForAudioFrameClassification - -[[autodoc]] WavLMForAudioFrameClassification - - forward - -## WavLMForXVector - -[[autodoc]] WavLMForXVector - - forward diff --git a/docs/source/en/model_doc/whisper.md b/docs/source/en/model_doc/whisper.md new file mode 100644 index 000000000000..fbf806cd41df --- /dev/null +++ b/docs/source/en/model_doc/whisper.md @@ -0,0 +1,117 @@ + + +# Whisper + +## Overview + +The Whisper model was proposed in [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever. + +The abstract from the paper is the following: + +*We study the capabilities of speech processing systems trained simply to predict large amounts of transcripts of audio on the internet. When scaled to 680,000 hours of multilingual and multitask supervision, the resulting models generalize well to standard benchmarks and are often competitive with prior fully supervised results but in a zeroshot transfer setting without the need for any finetuning. When compared to humans, the models approach their accuracy and robustness. We are releasing models and inference code to serve as a foundation for further work on robust speech processing.* + + +Tips: + +- The model usually performs well without requiring any finetuning. +- The architecture follows a classic encoder-decoder architecture, which means that it relies on the [`~generation.GenerationMixin.generate`] function for inference. +- Inference is currently only implemented for short-form i.e. audio is pre-segmented into <=30s segments. Long-form (including timestamps) will be implemented in a future release. +- One can use [`WhisperProcessor`] to prepare audio for the model, and decode the predicted ID's back into text. + +This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ). The Tensorflow version of this model was contributed by [amyeroberts](https://huggingface.co/amyeroberts). +The original code can be found [here](https://github.com/openai/whisper). 
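+
+A minimal transcription sketch, assuming the small `openai/whisper-tiny.en` checkpoint: [`WhisperProcessor`] turns the
+raw waveform into log-Mel input features and decodes the generated token ids back into text:
+
+```python
+from datasets import load_dataset
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+
+processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
+model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+
+# load a 16 kHz example utterance (audio must be pre-segmented into <=30s chunks)
+dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+sample = dataset[0]["audio"]
+
+inputs = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt")
+
+# encoder-decoder inference goes through `generate`
+predicted_ids = model.generate(inputs.input_features)
+transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+```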
+ + +## WhisperConfig + +[[autodoc]] WhisperConfig + +## WhisperTokenizer + +[[autodoc]] WhisperTokenizer + - set_prefix_tokens + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## WhisperTokenizerFast + +[[autodoc]] WhisperTokenizerFast + - set_prefix_tokens + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## WhisperFeatureExtractor + +[[autodoc]] WhisperFeatureExtractor + - __call__ + +## WhisperProcessor + +[[autodoc]] WhisperProcessor + - __call__ + - from_pretrained + - save_pretrained + - batch_decode + - decode + +## WhisperModel + +[[autodoc]] WhisperModel + - forward + - _mask_input_features + +## WhisperForConditionalGeneration + +[[autodoc]] WhisperForConditionalGeneration + - forward + +## WhisperForAudioClassification + +[[autodoc]] WhisperForAudioClassification + - forward + + +## TFWhisperModel + +[[autodoc]] TFWhisperModel + - call + +## TFWhisperForConditionalGeneration + +[[autodoc]] TFWhisperForConditionalGeneration + - call + + +## FlaxWhisperModel + +[[autodoc]] FlaxWhisperModel + - __call__ + +## FlaxWhisperForConditionalGeneration + +[[autodoc]] FlaxWhisperForConditionalGeneration + - __call__ + +## FlaxWhisperForAudioClassification + +[[autodoc]] FlaxWhisperForAudioClassification + - __call__ + diff --git a/docs/source/en/model_doc/whisper.mdx b/docs/source/en/model_doc/whisper.mdx deleted file mode 100644 index 4b7a60286184..000000000000 --- a/docs/source/en/model_doc/whisper.mdx +++ /dev/null @@ -1,81 +0,0 @@ - - -# Whisper - -## Overview - -The Whisper model was proposed in [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever. - -The abstract from the paper is the following: - -*We study the capabilities of speech processing systems trained simply to predict large amounts of transcripts of audio on the internet. When scaled to 680,000 hours of multilingual and multitask supervision, the resulting models generalize well to standard benchmarks and are often competitive with prior fully supervised results but in a zeroshot transfer setting without the need for any finetuning. When compared to humans, the models approach their accuracy and robustness. We are releasing models and inference code to serve as a foundation for further work on robust speech processing.* - - -Tips: - -- The model usually performs well without requiring any finetuning. -- The architecture follows a classic encoder-decoder architecture, which means that it relies on the [`~generation.GenerationMixin.generate`] function for inference. -- Inference is currently only implemented for short-form i.e. audio is pre-segmented into <=30s segments. Long-form (including timestamps) will be implemented in a future release. -- One can use [`WhisperProcessor`] to prepare audio for the model, and decode the predicted ID's back into text. - -This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ). The Tensorflow version of this model was contributed by [amyeroberts](https://huggingface.co/amyeroberts). -The original code can be found [here](https://github.com/openai/whisper). 
- - -## WhisperConfig - -[[autodoc]] WhisperConfig - -## WhisperTokenizer - -[[autodoc]] WhisperTokenizer - - set_prefix_tokens - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## WhisperFeatureExtractor - -[[autodoc]] WhisperFeatureExtractor - - __call__ - -## WhisperProcessor - -[[autodoc]] WhisperProcessor - - __call__ - - from_pretrained - - save_pretrained - - batch_decode - - decode - -## WhisperModel - -[[autodoc]] WhisperModel - - forward - -## WhisperForConditionalGeneration - -[[autodoc]] WhisperForConditionalGeneration - - forward - - -## TFWhisperModel - -[[autodoc]] TFWhisperModel - - call - -## TFWhisperForConditionalGeneration - -[[autodoc]] TFWhisperForConditionalGeneration - - call diff --git a/docs/source/en/model_doc/xclip.md b/docs/source/en/model_doc/xclip.md new file mode 100644 index 000000000000..45c4c3db749b --- /dev/null +++ b/docs/source/en/model_doc/xclip.md @@ -0,0 +1,80 @@ + + +# X-CLIP + +## Overview + +The X-CLIP model was proposed in [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling. +X-CLIP is a minimal extension of [CLIP](clip) for video. The model consists of a text encoder, a cross-frame vision encoder, a multi-frame integration Transformer, and a video-specific prompt generator. + +The abstract from the paper is the following: + +*Contrastive language-image pretraining has shown great success in learning visual-textual joint representation from web-scale data, demonstrating remarkable "zero-shot" generalization ability for various image tasks. However, how to effectively expand such new language-image pretraining methods to video domains is still an open problem. In this work, we present a simple yet effective approach that adapts the pretrained language-image models to video recognition directly, instead of pretraining a new model from scratch. More concretely, to capture the long-range dependencies of frames along the temporal dimension, we propose a cross-frame attention mechanism that explicitly exchanges information across frames. Such module is lightweight and can be plugged into pretrained language-image models seamlessly. Moreover, we propose a video-specific prompting scheme, which leverages video content information for generating discriminative textual prompts. Extensive experiments demonstrate that our approach is effective and can be generalized to different video recognition scenarios. In particular, under fully-supervised settings, our approach achieves a top-1 accuracy of 87.1% on Kinectics-400, while using 12 times fewer FLOPs compared with Swin-L and ViViT-H. In zero-shot experiments, our approach surpasses the current state-of-the-art methods by +7.6% and +14.9% in terms of top-1 accuracy under two popular protocols. In few-shot scenarios, our approach outperforms previous best methods by +32.1% and +23.1% when the labeled data is extremely limited.* + +Tips: + +- Usage of X-CLIP is identical to [CLIP](clip). + + + + X-CLIP architecture. Taken from the original paper. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). +The original code can be found [here](https://github.com/microsoft/VideoX/tree/master/X-CLIP). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with X-CLIP. 
+ +- Demo notebooks for X-CLIP can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/X-CLIP). + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## XCLIPProcessor + +[[autodoc]] XCLIPProcessor + +## XCLIPConfig + +[[autodoc]] XCLIPConfig + - from_text_vision_configs + +## XCLIPTextConfig + +[[autodoc]] XCLIPTextConfig + +## XCLIPVisionConfig + +[[autodoc]] XCLIPVisionConfig + +## XCLIPModel + +[[autodoc]] XCLIPModel + - forward + - get_text_features + - get_video_features + +## XCLIPTextModel + +[[autodoc]] XCLIPTextModel + - forward + +## XCLIPVisionModel + +[[autodoc]] XCLIPVisionModel + - forward diff --git a/docs/source/en/model_doc/xclip.mdx b/docs/source/en/model_doc/xclip.mdx deleted file mode 100644 index 96832f46e5b8..000000000000 --- a/docs/source/en/model_doc/xclip.mdx +++ /dev/null @@ -1,70 +0,0 @@ - - -# X-CLIP - -## Overview - -The X-CLIP model was proposed in [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling. -X-CLIP is a minimal extension of [CLIP](clip) for video. The model consists of a text encoder, a cross-frame vision encoder, a multi-frame integration Transformer, and a video-specific prompt generator. - -The abstract from the paper is the following: - -*Contrastive language-image pretraining has shown great success in learning visual-textual joint representation from web-scale data, demonstrating remarkable "zero-shot" generalization ability for various image tasks. However, how to effectively expand such new language-image pretraining methods to video domains is still an open problem. In this work, we present a simple yet effective approach that adapts the pretrained language-image models to video recognition directly, instead of pretraining a new model from scratch. More concretely, to capture the long-range dependencies of frames along the temporal dimension, we propose a cross-frame attention mechanism that explicitly exchanges information across frames. Such module is lightweight and can be plugged into pretrained language-image models seamlessly. Moreover, we propose a video-specific prompting scheme, which leverages video content information for generating discriminative textual prompts. Extensive experiments demonstrate that our approach is effective and can be generalized to different video recognition scenarios. In particular, under fully-supervised settings, our approach achieves a top-1 accuracy of 87.1% on Kinectics-400, while using 12 times fewer FLOPs compared with Swin-L and ViViT-H. In zero-shot experiments, our approach surpasses the current state-of-the-art methods by +7.6% and +14.9% in terms of top-1 accuracy under two popular protocols. In few-shot scenarios, our approach outperforms previous best methods by +32.1% and +23.1% when the labeled data is extremely limited.* - -Tips: - -- Usage of X-CLIP is identical to [CLIP](clip). -- Demo notebooks for X-CLIP can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/X-CLIP). - - - - X-CLIP architecture. Taken from the original paper. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). 
-The original code can be found [here](https://github.com/microsoft/VideoX/tree/master/X-CLIP). - - -## XCLIPProcessor - -[[autodoc]] XCLIPProcessor - -## XCLIPConfig - -[[autodoc]] XCLIPConfig - - from_text_vision_configs - -## XCLIPTextConfig - -[[autodoc]] XCLIPTextConfig - -## XCLIPVisionConfig - -[[autodoc]] XCLIPVisionConfig - -## XCLIPModel - -[[autodoc]] XCLIPModel - - forward - - get_text_features - - get_video_features - -## XCLIPTextModel - -[[autodoc]] XCLIPTextModel - - forward - -## XCLIPVisionModel - -[[autodoc]] XCLIPVisionModel - - forward diff --git a/docs/source/en/model_doc/xglm.md b/docs/source/en/model_doc/xglm.md new file mode 100644 index 000000000000..1b184c17e803 --- /dev/null +++ b/docs/source/en/model_doc/xglm.md @@ -0,0 +1,93 @@ + + +# XGLM + +## Overview + +The XGLM model was proposed in [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) +by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, +Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, +Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. + +The abstract from the paper is the following: + +*Large-scale autoregressive language models such as GPT-3 are few-shot learners that can perform a wide range of language +tasks without fine-tuning. While these models are known to be able to jointly represent many different languages, +their training data is dominated by English, potentially limiting their cross-lingual generalization. +In this work, we train multilingual autoregressive language models on a balanced corpus covering a diverse set of languages, +and study their few- and zero-shot learning capabilities in a wide range of tasks. Our largest model with 7.5 billion parameters +sets new state of the art in few-shot learning in more than 20 representative languages, outperforming GPT-3 of comparable size +in multilingual commonsense reasoning (with +7.4% absolute accuracy improvement in 0-shot settings and +9.4% in 4-shot settings) +and natural language inference (+5.4% in each of 0-shot and 4-shot settings). On the FLORES-101 machine translation benchmark, +our model outperforms GPT-3 on 171 out of 182 translation directions with 32 training examples, while surpassing the +official supervised baseline in 45 directions. We present a detailed analysis of where the model succeeds and fails, +showing in particular that it enables cross-lingual in-context learning on some tasks, while there is still room for improvement +on surface form robustness and adaptation to tasks that do not have a natural cloze form. Finally, we evaluate our models +in social value tasks such as hate speech detection in five languages and find it has limitations similar to comparable sized GPT-3 models.* + + +This model was contributed by [Suraj](https://huggingface.co/valhalla). The original code can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/xglm). 
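+
+A minimal generation sketch, assuming the `facebook/xglm-564M` checkpoint; XGLM is a decoder-only model, so inference
+goes through `generate`:
+
+```python
+from transformers import AutoTokenizer, XGLMForCausalLM
+
+tokenizer = AutoTokenizer.from_pretrained("facebook/xglm-564M")
+model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M")
+
+# greedy continuation of a short prompt
+inputs = tokenizer("The weather in Paris today is", return_tensors="pt")
+outputs = model.generate(**inputs, max_new_tokens=20, do_sample=False)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```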
+ +## Documentation resources + +- [Causal language modeling task guide](../tasks/language_modeling) + +## XGLMConfig + +[[autodoc]] XGLMConfig + +## XGLMTokenizer + +[[autodoc]] XGLMTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## XGLMTokenizerFast + +[[autodoc]] XGLMTokenizerFast + +## XGLMModel + +[[autodoc]] XGLMModel + - forward + +## XGLMForCausalLM + +[[autodoc]] XGLMForCausalLM + - forward + +## TFXGLMModel + +[[autodoc]] TFXGLMModel + - call + +## TFXGLMForCausalLM + +[[autodoc]] TFXGLMForCausalLM + - call + +## FlaxXGLMModel + +[[autodoc]] FlaxXGLMModel + - __call__ + +## FlaxXGLMForCausalLM + +[[autodoc]] FlaxXGLMForCausalLM + - __call__ \ No newline at end of file diff --git a/docs/source/en/model_doc/xglm.mdx b/docs/source/en/model_doc/xglm.mdx deleted file mode 100644 index e35bab25f89c..000000000000 --- a/docs/source/en/model_doc/xglm.mdx +++ /dev/null @@ -1,85 +0,0 @@ - - -# XGLM - -## Overview - -The XGLM model was proposed in [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) -by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, -Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, -Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. - -The abstract from the paper is the following: - -*Large-scale autoregressive language models such as GPT-3 are few-shot learners that can perform a wide range of language -tasks without fine-tuning. While these models are known to be able to jointly represent many different languages, -their training data is dominated by English, potentially limiting their cross-lingual generalization. -In this work, we train multilingual autoregressive language models on a balanced corpus covering a diverse set of languages, -and study their few- and zero-shot learning capabilities in a wide range of tasks. Our largest model with 7.5 billion parameters -sets new state of the art in few-shot learning in more than 20 representative languages, outperforming GPT-3 of comparable size -in multilingual commonsense reasoning (with +7.4% absolute accuracy improvement in 0-shot settings and +9.4% in 4-shot settings) -and natural language inference (+5.4% in each of 0-shot and 4-shot settings). On the FLORES-101 machine translation benchmark, -our model outperforms GPT-3 on 171 out of 182 translation directions with 32 training examples, while surpassing the -official supervised baseline in 45 directions. We present a detailed analysis of where the model succeeds and fails, -showing in particular that it enables cross-lingual in-context learning on some tasks, while there is still room for improvement -on surface form robustness and adaptation to tasks that do not have a natural cloze form. Finally, we evaluate our models -in social value tasks such as hate speech detection in five languages and find it has limitations similar to comparable sized GPT-3 models.* - - -This model was contributed by [Suraj](https://huggingface.co/valhalla). The original code can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/xglm). 
- -## XGLMConfig - -[[autodoc]] XGLMConfig - -## XGLMTokenizer - -[[autodoc]] XGLMTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## XGLMTokenizerFast - -[[autodoc]] XGLMTokenizerFast - -## XGLMModel - -[[autodoc]] XGLMModel - - forward - -## XGLMForCausalLM - -[[autodoc]] XGLMForCausalLM - - forward - -## TFXGLMModel - -[[autodoc]] TFXGLMModel - - call - -## TFXGLMForCausalLM - -[[autodoc]] TFXGLMForCausalLM - - call - -## FlaxXGLMModel - -[[autodoc]] FlaxXGLMModel - - __call__ - -## FlaxXGLMForCausalLM - -[[autodoc]] FlaxXGLMForCausalLM - - __call__ \ No newline at end of file diff --git a/docs/source/en/model_doc/xlm-prophetnet.md b/docs/source/en/model_doc/xlm-prophetnet.md new file mode 100644 index 000000000000..5e7ba5b7e3f5 --- /dev/null +++ b/docs/source/en/model_doc/xlm-prophetnet.md @@ -0,0 +1,91 @@ + + +# XLM-ProphetNet + +
+ +**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign +@patrickvonplaten + + +## Overview + +The XLM-ProphetNet model was proposed in [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training,](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei +Zhang, Ming Zhou on 13 Jan, 2020. + +XLM-ProphetNet is an encoder-decoder model and can predict n-future tokens for "ngram" language modeling instead of +just the next token. Its architecture is identical to ProhpetNet, but the model was trained on the multi-lingual +"wiki100" Wikipedia dump. + +The abstract from the paper is the following: + +*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel +self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of +the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by +n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time +step. The future n-gram prediction explicitly encourages the model to plan for the future tokens and prevent +overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale +dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for +abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new +state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.* + +The Authors' code can be found [here](https://github.com/microsoft/ProphetNet). + +Tips: + +- XLM-ProphetNet's model architecture and pretraining objective is same as ProphetNet, but XLM-ProphetNet was pre-trained on the cross-lingual dataset XGLUE. + +## Documentation resources + +- [Causal language modeling task guide](../tasks/language_modeling) +- [Translation task guide](../tasks/translation) +- [Summarization task guide](../tasks/summarization) + +## XLMProphetNetConfig + +[[autodoc]] XLMProphetNetConfig + +## XLMProphetNetTokenizer + +[[autodoc]] XLMProphetNetTokenizer + +## XLMProphetNetModel + +[[autodoc]] XLMProphetNetModel + +## XLMProphetNetEncoder + +[[autodoc]] XLMProphetNetEncoder + +## XLMProphetNetDecoder + +[[autodoc]] XLMProphetNetDecoder + +## XLMProphetNetForConditionalGeneration + +[[autodoc]] XLMProphetNetForConditionalGeneration + +## XLMProphetNetForCausalLM + +[[autodoc]] XLMProphetNetForCausalLM diff --git a/docs/source/en/model_doc/xlm-prophetnet.mdx b/docs/source/en/model_doc/xlm-prophetnet.mdx deleted file mode 100644 index af4a3bb6e87e..000000000000 --- a/docs/source/en/model_doc/xlm-prophetnet.mdx +++ /dev/null @@ -1,68 +0,0 @@ - - -# XLM-ProphetNet - -**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign -@patrickvonplaten - - -## Overview - -The XLM-ProphetNet model was proposed in [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training,](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei -Zhang, Ming Zhou on 13 Jan, 2020. 
- -XLM-ProphetNet is an encoder-decoder model and can predict n-future tokens for "ngram" language modeling instead of -just the next token. Its architecture is identical to ProhpetNet, but the model was trained on the multi-lingual -"wiki100" Wikipedia dump. - -The abstract from the paper is the following: - -*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel -self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of -the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by -n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time -step. The future n-gram prediction explicitly encourages the model to plan for the future tokens and prevent -overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale -dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for -abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new -state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.* - -The Authors' code can be found [here](https://github.com/microsoft/ProphetNet). - -## XLMProphetNetConfig - -[[autodoc]] XLMProphetNetConfig - -## XLMProphetNetTokenizer - -[[autodoc]] XLMProphetNetTokenizer - -## XLMProphetNetModel - -[[autodoc]] XLMProphetNetModel - -## XLMProphetNetEncoder - -[[autodoc]] XLMProphetNetEncoder - -## XLMProphetNetDecoder - -[[autodoc]] XLMProphetNetDecoder - -## XLMProphetNetForConditionalGeneration - -[[autodoc]] XLMProphetNetForConditionalGeneration - -## XLMProphetNetForCausalLM - -[[autodoc]] XLMProphetNetForCausalLM diff --git a/docs/source/en/model_doc/xlm-roberta-xl.md b/docs/source/en/model_doc/xlm-roberta-xl.md new file mode 100644 index 000000000000..b65929460706 --- /dev/null +++ b/docs/source/en/model_doc/xlm-roberta-xl.md @@ -0,0 +1,81 @@ + + +# XLM-RoBERTa-XL + +## Overview + +The XLM-RoBERTa-XL model was proposed in [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. + +The abstract from the paper is the following: + +*Recent work has demonstrated the effectiveness of cross-lingual language model pretraining for cross-lingual understanding. In this study, we present the results of two larger multilingual masked language models, with 3.5B and 10.7B parameters. Our two new models dubbed XLM-R XL and XLM-R XXL outperform XLM-R by 1.8% and 2.4% average accuracy on XNLI. Our model also outperforms the RoBERTa-Large model on several English tasks of the GLUE benchmark by 0.3% on average while handling 99 more languages. This suggests pretrained models with larger capacity may obtain both strong performance on high-resource languages while greatly improving low-resource languages. We make our code and models publicly available.* + +Tips: + +- XLM-RoBERTa-XL is a multilingual model trained on 100 different languages. Unlike some XLM multilingual models, it does + not require `lang` tensors to understand which language is used, and should be able to determine the correct + language from the input ids. 
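+The tip above can be checked with a minimal masked language modeling sketch. This is only an illustration and assumes the `facebook/xlm-roberta-xl` checkpoint (about 3.5B parameters, so it needs a correspondingly large amount of memory):

```python
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Assumption: the facebook/xlm-roberta-xl checkpoint; other XLM-RoBERTa-XL checkpoints work the same way.
tokenizer = AutoTokenizer.from_pretrained("facebook/xlm-roberta-xl")
model = AutoModelForMaskedLM.from_pretrained("facebook/xlm-roberta-xl")

# No `lang` tensor is passed; the language is inferred from the input ids alone.
inputs = tokenizer("Paris is the <mask> of France.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# Pick the highest-scoring token for the masked position.
mask_positions = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
predicted_ids = logits[0, mask_positions].argmax(dim=-1)
print(tokenizer.decode(predicted_ids))
```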
+ +This model was contributed by [Soonhwan-Kwon](https://github.com/Soonhwan-Kwon) and [stefan-it](https://huggingface.co/stefan-it). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/xlmr). + +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Token classification task guide](../tasks/token_classification) +- [Question answering task guide](../tasks/question_answering) +- [Causal language modeling task guide](../tasks/language_modeling) +- [Masked language modeling task guide](../tasks/masked_language_modeling) +- [Multiple choice task guide](../tasks/multiple_choice) + +## XLMRobertaXLConfig + +[[autodoc]] XLMRobertaXLConfig + +## XLMRobertaXLModel + +[[autodoc]] XLMRobertaXLModel + - forward + +## XLMRobertaXLForCausalLM + +[[autodoc]] XLMRobertaXLForCausalLM + - forward + +## XLMRobertaXLForMaskedLM + +[[autodoc]] XLMRobertaXLForMaskedLM + - forward + +## XLMRobertaXLForSequenceClassification + +[[autodoc]] XLMRobertaXLForSequenceClassification + - forward + +## XLMRobertaXLForMultipleChoice + +[[autodoc]] XLMRobertaXLForMultipleChoice + - forward + +## XLMRobertaXLForTokenClassification + +[[autodoc]] XLMRobertaXLForTokenClassification + - forward + +## XLMRobertaXLForQuestionAnswering + +[[autodoc]] XLMRobertaXLForQuestionAnswering + - forward diff --git a/docs/source/en/model_doc/xlm-roberta-xl.mdx b/docs/source/en/model_doc/xlm-roberta-xl.mdx deleted file mode 100644 index 01829a128c00..000000000000 --- a/docs/source/en/model_doc/xlm-roberta-xl.mdx +++ /dev/null @@ -1,69 +0,0 @@ - - -# XLM-RoBERTa-XL - -## Overview - -The XLM-RoBERTa-XL model was proposed in [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. - -The abstract from the paper is the following: - -*Recent work has demonstrated the effectiveness of cross-lingual language model pretraining for cross-lingual understanding. In this study, we present the results of two larger multilingual masked language models, with 3.5B and 10.7B parameters. Our two new models dubbed XLM-R XL and XLM-R XXL outperform XLM-R by 1.8% and 2.4% average accuracy on XNLI. Our model also outperforms the RoBERTa-Large model on several English tasks of the GLUE benchmark by 0.3% on average while handling 99 more languages. This suggests pretrained models with larger capacity may obtain both strong performance on high-resource languages while greatly improving low-resource languages. We make our code and models publicly available.* - -Tips: - -- XLM-RoBERTa-XL is a multilingual model trained on 100 different languages. Unlike some XLM multilingual models, it does - not require `lang` tensors to understand which language is used, and should be able to determine the correct - language from the input ids. - -This model was contributed by [Soonhwan-Kwon](https://github.com/Soonhwan-Kwon) and [stefan-it](https://huggingface.co/stefan-it). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/xlmr). 
- - -## XLMRobertaXLConfig - -[[autodoc]] XLMRobertaXLConfig - -## XLMRobertaXLModel - -[[autodoc]] XLMRobertaXLModel - - forward - -## XLMRobertaXLForCausalLM - -[[autodoc]] XLMRobertaXLForCausalLM - - forward - -## XLMRobertaXLForMaskedLM - -[[autodoc]] XLMRobertaXLForMaskedLM - - forward - -## XLMRobertaXLForSequenceClassification - -[[autodoc]] XLMRobertaXLForSequenceClassification - - forward - -## XLMRobertaXLForMultipleChoice - -[[autodoc]] XLMRobertaXLForMultipleChoice - - forward - -## XLMRobertaXLForTokenClassification - -[[autodoc]] XLMRobertaXLForTokenClassification - - forward - -## XLMRobertaXLForQuestionAnswering - -[[autodoc]] XLMRobertaXLForQuestionAnswering - - forward diff --git a/docs/source/en/model_doc/xlm-roberta.md b/docs/source/en/model_doc/xlm-roberta.md new file mode 100644 index 000000000000..935003156fd1 --- /dev/null +++ b/docs/source/en/model_doc/xlm-roberta.md @@ -0,0 +1,232 @@ + + +# XLM-RoBERTa + +
+
+## Overview
+
+The XLM-RoBERTa model was proposed in [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume
+Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook's
+RoBERTa model released in 2019. It is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl
+data.
+
+The abstract from the paper is the following:
+
+*This paper shows that pretraining multilingual language models at scale leads to significant performance gains for a
+wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred
+languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly
+outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +13.8% average accuracy on
+XNLI, +12.3% average F1 score on MLQA, and +2.1% average F1 score on NER. XLM-R performs particularly well on
+low-resource languages, improving 11.8% in XNLI accuracy for Swahili and 9.2% for Urdu over the previous XLM model. We
+also present a detailed empirical evaluation of the key factors that are required to achieve these gains, including the
+trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and low resource
+languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling without sacrificing
+per-language performance; XLM-R is very competitive with strong monolingual models on the GLUE and XNLI benchmarks. We
+will make XLM-R code, data, and models publicly available.*
+
+Tips:
+
+- XLM-RoBERTa is a multilingual model trained on 100 different languages. Unlike some XLM multilingual models, it does
+  not require `lang` tensors to understand which language is used, and should be able to determine the correct
+  language from the input ids (a short usage sketch is shown after the resources below).
+- Uses RoBERTa tricks on the XLM approach, but does not use the translation language modeling objective. It only uses masked language modeling on sentences coming from one language.
+- This implementation is the same as RoBERTa. Refer to the [documentation of RoBERTa](roberta) for usage examples
+  as well as the information relative to the inputs and outputs.
+
+This model was contributed by [stefan-it](https://huggingface.co/stefan-it). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/xlmr).
+
+## Resources
+
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with XLM-RoBERTa. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+
+
+
+- A blog post on how to [finetune XLM RoBERTa for multiclass classification with Habana Gaudi on AWS](https://www.philschmid.de/habana-distributed-training)
+- [`XLMRobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb).
+- [`TFXLMRobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb). +- [`FlaxXLMRobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_flax.ipynb). +- [Text classification](https://huggingface.co/docs/transformers/tasks/sequence_classification) chapter of the 🤗 Hugging Face Task Guides. +- [Text classification task guide](../tasks/sequence_classification) + + + +- [`XLMRobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb). +- [`TFXLMRobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/token-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb). +- [`FlaxXLMRobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/token-classification). +- [Token classification](https://huggingface.co/course/chapter7/2?fw=pt) chapter of the 🤗 Hugging Face Course. +- [Token classification task guide](../tasks/token_classification) + + + +- [`XLMRobertaForCausalLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb). +- [Causal language modeling](https://huggingface.co/docs/transformers/tasks/language_modeling) chapter of the 🤗 Hugging Face Task Guides. +- [Causal language modeling task guide](../tasks/language_modeling) + + + +- [`XLMRobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#robertabertdistilbert-and-masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb). +- [`TFXLMRobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling#run_mlmpy) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb). +- [`FlaxXLMRobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/masked_language_modeling_flax.ipynb). +- [Masked language modeling](https://huggingface.co/course/chapter7/3?fw=pt) chapter of the 🤗 Hugging Face Course. 
+- [Masked language modeling](../tasks/masked_language_modeling) + + + +- [`XLMRobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb). +- [`TFXLMRobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb). +- [`FlaxXLMRobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/question-answering). +- [Question answering](https://huggingface.co/course/chapter7/7?fw=pt) chapter of the 🤗 Hugging Face Course. +- [Question answering task guide](../tasks/question_answering) + +**Multiple choice** + +- [`XLMRobertaForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb). +- [`TFXLMRobertaForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb). +- [Multiple choice task guide](../tasks/multiple_choice) + +🚀 Deploy + +- A blog post on how to [Deploy Serverless XLM RoBERTa on AWS Lambda](https://www.philschmid.de/multilingual-serverless-xlm-roberta-with-huggingface). 
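+Before the API reference, here is a minimal inference sketch. It is only an illustration and assumes the `xlm-roberta-base` checkpoint; because the model determines the language from the input ids, the same pipeline handles inputs in any of the covered languages:

```python
from transformers import pipeline

# Assumption: the base-sized checkpoint; larger XLM-RoBERTa checkpoints are used identically.
unmasker = pipeline("fill-mask", model="xlm-roberta-base")

# French and German inputs, with no language id or extra configuration.
print(unmasker("Bonjour, je suis un modèle <mask>.")[0]["token_str"])
print(unmasker("Hallo, ich bin ein <mask> Modell.")[0]["token_str"])
```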
+ +## XLMRobertaConfig + +[[autodoc]] XLMRobertaConfig + +## XLMRobertaTokenizer + +[[autodoc]] XLMRobertaTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## XLMRobertaTokenizerFast + +[[autodoc]] XLMRobertaTokenizerFast + +## XLMRobertaModel + +[[autodoc]] XLMRobertaModel + - forward + +## XLMRobertaForCausalLM + +[[autodoc]] XLMRobertaForCausalLM + - forward + +## XLMRobertaForMaskedLM + +[[autodoc]] XLMRobertaForMaskedLM + - forward + +## XLMRobertaForSequenceClassification + +[[autodoc]] XLMRobertaForSequenceClassification + - forward + +## XLMRobertaForMultipleChoice + +[[autodoc]] XLMRobertaForMultipleChoice + - forward + +## XLMRobertaForTokenClassification + +[[autodoc]] XLMRobertaForTokenClassification + - forward + +## XLMRobertaForQuestionAnswering + +[[autodoc]] XLMRobertaForQuestionAnswering + - forward + +## TFXLMRobertaModel + +[[autodoc]] TFXLMRobertaModel + - call + +## TFXLMRobertaForCausalLM + +[[autodoc]] TFXLMRobertaForCausalLM + - call + +## TFXLMRobertaForMaskedLM + +[[autodoc]] TFXLMRobertaForMaskedLM + - call + +## TFXLMRobertaForSequenceClassification + +[[autodoc]] TFXLMRobertaForSequenceClassification + - call + +## TFXLMRobertaForMultipleChoice + +[[autodoc]] TFXLMRobertaForMultipleChoice + - call + +## TFXLMRobertaForTokenClassification + +[[autodoc]] TFXLMRobertaForTokenClassification + - call + +## TFXLMRobertaForQuestionAnswering + +[[autodoc]] TFXLMRobertaForQuestionAnswering + - call + +## FlaxXLMRobertaModel + +[[autodoc]] FlaxXLMRobertaModel + - __call__ + +## FlaxXLMRobertaForCausalLM + +[[autodoc]] FlaxXLMRobertaForCausalLM + - __call__ + +## FlaxXLMRobertaForMaskedLM + +[[autodoc]] FlaxXLMRobertaForMaskedLM + - __call__ + +## FlaxXLMRobertaForSequenceClassification + +[[autodoc]] FlaxXLMRobertaForSequenceClassification + - __call__ + +## FlaxXLMRobertaForMultipleChoice + +[[autodoc]] FlaxXLMRobertaForMultipleChoice + - __call__ + +## FlaxXLMRobertaForTokenClassification + +[[autodoc]] FlaxXLMRobertaForTokenClassification + - __call__ + +## FlaxXLMRobertaForQuestionAnswering + +[[autodoc]] FlaxXLMRobertaForQuestionAnswering + - __call__ diff --git a/docs/source/en/model_doc/xlm-roberta.mdx b/docs/source/en/model_doc/xlm-roberta.mdx deleted file mode 100644 index 941feac9d35a..000000000000 --- a/docs/source/en/model_doc/xlm-roberta.mdx +++ /dev/null @@ -1,202 +0,0 @@ - - -# XLM-RoBERTa - -## Overview - -The XLM-RoBERTa model was proposed in [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume -Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook's -RoBERTa model released in 2019. It is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl -data. - -The abstract from the paper is the following: - -*This paper shows that pretraining multilingual language models at scale leads to significant performance gains for a -wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred -languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly -outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +13.8% average accuracy on -XNLI, +12.3% average F1 score on MLQA, and +2.1% average F1 score on NER. 
XLM-R performs particularly well on -low-resource languages, improving 11.8% in XNLI accuracy for Swahili and 9.2% for Urdu over the previous XLM model. We -also present a detailed empirical evaluation of the key factors that are required to achieve these gains, including the -trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and low resource -languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling without sacrificing -per-language performance; XLM-Ris very competitive with strong monolingual models on the GLUE and XNLI benchmarks. We -will make XLM-R code, data, and models publicly available.* - -Tips: - -- XLM-RoBERTa is a multilingual model trained on 100 different languages. Unlike some XLM multilingual models, it does - not require `lang` tensors to understand which language is used, and should be able to determine the correct - language from the input ids. -- This implementation is the same as RoBERTa. Refer to the [documentation of RoBERTa](roberta) for usage examples - as well as the information relative to the inputs and outputs. - -This model was contributed by [stefan-it](https://huggingface.co/stefan-it). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/xlmr). - -## Resources - -A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with XLM-RoBERTa. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. - - - -- A blog post on how to [finetune XLM RoBERTa for multiclass classification with Habana Gaudi on AWS](https://www.philschmid.de/habana-distributed-training) -- [`XLMRobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb). -- [`TFXLMRobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb). -- [`FlaxXLMRobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_flax.ipynb). -- [Text classification](https://huggingface.co/docs/transformers/tasks/sequence_classification) chapter of the 🤗 Hugging Face Task Guides. - - - -- [`XLMRobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb). -- [`TFXLMRobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/token-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb). 
-- [`FlaxXLMRobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/token-classification). -- [Token classification](https://huggingface.co/course/chapter7/2?fw=pt) chapter of the 🤗 Hugging Face Course. - - - -- [`XLMRobertaForCausalLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb). -- [Causal language modeling](https://huggingface.co/docs/transformers/tasks/language_modeling) chapter of the 🤗 Hugging Face Task Guides. - - - -- [`XLMRobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#robertabertdistilbert-and-masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb). -- [`TFXLMRobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling#run_mlmpy) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb). -- [`FlaxXLMRobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/masked_language_modeling_flax.ipynb). -- [Masked language modeling](https://huggingface.co/course/chapter7/3?fw=pt) chapter of the 🤗 Hugging Face Course. - - - -- [`XLMRobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb). -- [`TFXLMRobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb). -- [`FlaxXLMRobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/question-answering). -- [Question answering](https://huggingface.co/course/chapter7/7?fw=pt) chapter of the 🤗 Hugging Face Course. - -**Multiple choice** - -- [`XLMRobertaForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb). -- [`TFXLMRobertaForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb). - -🚀 Deploy - -- A blog post on how to [Deploy Serveless XLM RoBERTa on AWS Lambda](https://www.philschmid.de/multilingual-serverless-xlm-roberta-with-huggingface). 
- -## XLMRobertaConfig - -[[autodoc]] XLMRobertaConfig - -## XLMRobertaTokenizer - -[[autodoc]] XLMRobertaTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## XLMRobertaTokenizerFast - -[[autodoc]] XLMRobertaTokenizerFast - -## XLMRobertaModel - -[[autodoc]] XLMRobertaModel - - forward - -## XLMRobertaForCausalLM - -[[autodoc]] XLMRobertaForCausalLM - - forward - -## XLMRobertaForMaskedLM - -[[autodoc]] XLMRobertaForMaskedLM - - forward - -## XLMRobertaForSequenceClassification - -[[autodoc]] XLMRobertaForSequenceClassification - - forward - -## XLMRobertaForMultipleChoice - -[[autodoc]] XLMRobertaForMultipleChoice - - forward - -## XLMRobertaForTokenClassification - -[[autodoc]] XLMRobertaForTokenClassification - - forward - -## XLMRobertaForQuestionAnswering - -[[autodoc]] XLMRobertaForQuestionAnswering - - forward - -## TFXLMRobertaModel - -[[autodoc]] TFXLMRobertaModel - - call - -## TFXLMRobertaForMaskedLM - -[[autodoc]] TFXLMRobertaForMaskedLM - - call - -## TFXLMRobertaForSequenceClassification - -[[autodoc]] TFXLMRobertaForSequenceClassification - - call - -## TFXLMRobertaForMultipleChoice - -[[autodoc]] TFXLMRobertaForMultipleChoice - - call - -## TFXLMRobertaForTokenClassification - -[[autodoc]] TFXLMRobertaForTokenClassification - - call - -## TFXLMRobertaForQuestionAnswering - -[[autodoc]] TFXLMRobertaForQuestionAnswering - - call - -## FlaxXLMRobertaModel - -[[autodoc]] FlaxXLMRobertaModel - - __call__ - -## FlaxXLMRobertaForMaskedLM - -[[autodoc]] FlaxXLMRobertaForMaskedLM - - __call__ - -## FlaxXLMRobertaForSequenceClassification - -[[autodoc]] FlaxXLMRobertaForSequenceClassification - - __call__ - -## FlaxXLMRobertaForMultipleChoice - -[[autodoc]] FlaxXLMRobertaForMultipleChoice - - __call__ - -## FlaxXLMRobertaForTokenClassification - -[[autodoc]] FlaxXLMRobertaForTokenClassification - - __call__ - -## FlaxXLMRobertaForQuestionAnswering - -[[autodoc]] FlaxXLMRobertaForQuestionAnswering - - __call__ diff --git a/docs/source/en/model_doc/xlm-v.md b/docs/source/en/model_doc/xlm-v.md new file mode 100644 index 000000000000..38bed0dc46b5 --- /dev/null +++ b/docs/source/en/model_doc/xlm-v.md @@ -0,0 +1,47 @@ + + +# XLM-V + +## Overview + +XLM-V is multilingual language model with a one million token vocabulary trained on 2.5TB of data from Common Crawl (same as XLM-R). +It was introduced in the [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) +paper by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer and Madian Khabsa. + +From the abstract of the XLM-V paper: + +*Large multilingual language models typically rely on a single vocabulary shared across 100+ languages. +As these models have increased in parameter count and depth, vocabulary size has remained largely unchanged. +This vocabulary bottleneck limits the representational capabilities of multilingual models like XLM-R. +In this paper, we introduce a new approach for scaling to very large multilingual vocabularies by +de-emphasizing token sharing between languages with little lexical overlap and assigning vocabulary capacity +to achieve sufficient coverage for each individual language. Tokenizations using our vocabulary are typically +more semantically meaningful and shorter compared to XLM-R. Leveraging this improved vocabulary, we train XLM-V, +a multilingual language model with a one million token vocabulary. 
XLM-V outperforms XLM-R on every task we
+tested on ranging from natural language inference (XNLI), question answering (MLQA, XQuAD, TyDiQA), and
+named entity recognition (WikiAnn) to low-resource tasks (Americas NLI, MasakhaNER).*
+
+Tips:
+
+- XLM-V is compatible with the XLM-RoBERTa model architecture; only the model weights from the [`fairseq`](https://github.com/facebookresearch/fairseq)
+  library had to be converted.
+- The `XLMTokenizer` implementation is used to load the vocabulary and perform tokenization.
+
+An XLM-V (base size) model is available under the [`facebook/xlm-v-base`](https://huggingface.co/facebook/xlm-v-base) identifier.
+
+This model was contributed by [stefan-it](https://huggingface.co/stefan-it), including detailed experiments with XLM-V on downstream tasks.
+The experiments repository can be found [here](https://github.com/stefan-it/xlm-v-experiments).
diff --git a/docs/source/en/model_doc/xlm.md b/docs/source/en/model_doc/xlm.md
new file mode 100644
index 000000000000..8b5b31a2dbef
--- /dev/null
+++ b/docs/source/en/model_doc/xlm.md
@@ -0,0 +1,150 @@
+
+
+# XLM
+
+
+## Overview
+
+The XLM model was proposed in [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by
+Guillaume Lample, Alexis Conneau. It's a transformer pretrained using one of the following objectives:
+
+- a causal language modeling (CLM) objective (next token prediction),
+- a masked language modeling (MLM) objective (BERT-like), or
+- a Translation Language Modeling (TLM) objective (an extension of BERT's MLM to multiple language inputs)
+
+The abstract from the paper is the following:
+
+*Recent studies have demonstrated the efficiency of generative pretraining for English natural language understanding.
+In this work, we extend this approach to multiple languages and show the effectiveness of cross-lingual pretraining. We
+propose two methods to learn cross-lingual language models (XLMs): one unsupervised that only relies on monolingual
+data, and one supervised that leverages parallel data with a new cross-lingual language model objective. We obtain
+state-of-the-art results on cross-lingual classification, unsupervised and supervised machine translation. On XNLI, our
+approach pushes the state of the art by an absolute gain of 4.9% accuracy. On unsupervised machine translation, we
+obtain 34.3 BLEU on WMT'16 German-English, improving the previous state of the art by more than 9 BLEU. On supervised
+machine translation, we obtain a new state of the art of 38.5 BLEU on WMT'16 Romanian-English, outperforming the
+previous best approach by more than 4 BLEU. Our code and pretrained models will be made publicly available.*
+
+Tips:
+
+- XLM has many different checkpoints, which were trained using different objectives: CLM, MLM or TLM. Make sure to
+  select the correct objective for your task (e.g. MLM checkpoints are not suitable for generation).
+- XLM has multilingual checkpoints which leverage a specific `lang` parameter. Check out the [multi-lingual](../multilingual) page for more information, and see the short sketch below.
+- A transformer model trained on several languages. There are three different types of training for this model and the library provides checkpoints for all of them:
+
+  * Causal language modeling (CLM), which is the traditional autoregressive training (so this model could be in the previous section as well). One of the languages is selected for each training sample, and the model input is a sentence of 256 tokens that may span several documents in one of those languages.
+  * Masked language modeling (MLM), which is like RoBERTa. One of the languages is selected for each training sample, and the model input is a sentence of 256 tokens that may span several documents in one of those languages, with dynamic masking of the tokens.
+  * A combination of MLM and translation language modeling (TLM). This consists of concatenating a sentence in two different languages, with random masking. To predict one of the masked tokens, the model can use both the surrounding context in language 1 and the context given by language 2.
+
+This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/facebookresearch/XLM/).
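+As a minimal sketch of the `lang` mechanism described above (assuming the English/French CLM checkpoint `xlm-clm-enfr-1024`), language ids are looked up in `tokenizer.lang2id` and passed to the model as a tensor with the same shape as the input ids:

```python
import torch
from transformers import XLMTokenizer, XLMWithLMHeadModel

# Assumption: the xlm-clm-enfr-1024 checkpoint (CLM objective, English/French).
tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")

input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")])  # batch size of 1

# tokenizer.lang2id maps language codes to the ids the model expects.
language_id = tokenizer.lang2id["en"]
langs = torch.tensor([language_id] * input_ids.shape[1]).view(1, -1)  # same shape as input_ids

outputs = model(input_ids, langs=langs)
```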
+ +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Token classification task guide](../tasks/token_classification) +- [Question answering task guide](../tasks/question_answering) +- [Causal language modeling task guide](../tasks/language_modeling) +- [Masked language modeling task guide](../tasks/masked_language_modeling) +- [Multiple choice task guide](../tasks/multiple_choice) + +## XLMConfig + +[[autodoc]] XLMConfig + +## XLMTokenizer + +[[autodoc]] XLMTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## XLM specific outputs + +[[autodoc]] models.xlm.modeling_xlm.XLMForQuestionAnsweringOutput + +## XLMModel + +[[autodoc]] XLMModel + - forward + +## XLMWithLMHeadModel + +[[autodoc]] XLMWithLMHeadModel + - forward + +## XLMForSequenceClassification + +[[autodoc]] XLMForSequenceClassification + - forward + +## XLMForMultipleChoice + +[[autodoc]] XLMForMultipleChoice + - forward + +## XLMForTokenClassification + +[[autodoc]] XLMForTokenClassification + - forward + +## XLMForQuestionAnsweringSimple + +[[autodoc]] XLMForQuestionAnsweringSimple + - forward + +## XLMForQuestionAnswering + +[[autodoc]] XLMForQuestionAnswering + - forward + +## TFXLMModel + +[[autodoc]] TFXLMModel + - call + +## TFXLMWithLMHeadModel + +[[autodoc]] TFXLMWithLMHeadModel + - call + +## TFXLMForSequenceClassification + +[[autodoc]] TFXLMForSequenceClassification + - call + +## TFXLMForMultipleChoice + +[[autodoc]] TFXLMForMultipleChoice + - call + +## TFXLMForTokenClassification + +[[autodoc]] TFXLMForTokenClassification + - call + +## TFXLMForQuestionAnsweringSimple + +[[autodoc]] TFXLMForQuestionAnsweringSimple + - call diff --git a/docs/source/en/model_doc/xlm.mdx b/docs/source/en/model_doc/xlm.mdx deleted file mode 100644 index a441c64c86c9..000000000000 --- a/docs/source/en/model_doc/xlm.mdx +++ /dev/null @@ -1,124 +0,0 @@ - - -# XLM - -## Overview - -The XLM model was proposed in [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by -Guillaume Lample, Alexis Conneau. It's a transformer pretrained using one of the following objectives: - -- a causal language modeling (CLM) objective (next token prediction), -- a masked language modeling (MLM) objective (BERT-like), or -- a Translation Language Modeling (TLM) object (extension of BERT's MLM to multiple language inputs) - -The abstract from the paper is the following: - -*Recent studies have demonstrated the efficiency of generative pretraining for English natural language understanding. -In this work, we extend this approach to multiple languages and show the effectiveness of cross-lingual pretraining. We -propose two methods to learn cross-lingual language models (XLMs): one unsupervised that only relies on monolingual -data, and one supervised that leverages parallel data with a new cross-lingual language model objective. We obtain -state-of-the-art results on cross-lingual classification, unsupervised and supervised machine translation. On XNLI, our -approach pushes the state of the art by an absolute gain of 4.9% accuracy. On unsupervised machine translation, we -obtain 34.3 BLEU on WMT'16 German-English, improving the previous state of the art by more than 9 BLEU. On supervised -machine translation, we obtain a new state of the art of 38.5 BLEU on WMT'16 Romanian-English, outperforming the -previous best approach by more than 4 BLEU. 
Our code and pretrained models will be made publicly available.* - -Tips: - -- XLM has many different checkpoints, which were trained using different objectives: CLM, MLM or TLM. Make sure to - select the correct objective for your task (e.g. MLM checkpoints are not suitable for generation). -- XLM has multilingual checkpoints which leverage a specific `lang` parameter. Check out the [multi-lingual](../multilingual) page for more information. - -This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/facebookresearch/XLM/). - - -## XLMConfig - -[[autodoc]] XLMConfig - -## XLMTokenizer - -[[autodoc]] XLMTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## XLM specific outputs - -[[autodoc]] models.xlm.modeling_xlm.XLMForQuestionAnsweringOutput - -## XLMModel - -[[autodoc]] XLMModel - - forward - -## XLMWithLMHeadModel - -[[autodoc]] XLMWithLMHeadModel - - forward - -## XLMForSequenceClassification - -[[autodoc]] XLMForSequenceClassification - - forward - -## XLMForMultipleChoice - -[[autodoc]] XLMForMultipleChoice - - forward - -## XLMForTokenClassification - -[[autodoc]] XLMForTokenClassification - - forward - -## XLMForQuestionAnsweringSimple - -[[autodoc]] XLMForQuestionAnsweringSimple - - forward - -## XLMForQuestionAnswering - -[[autodoc]] XLMForQuestionAnswering - - forward - -## TFXLMModel - -[[autodoc]] TFXLMModel - - call - -## TFXLMWithLMHeadModel - -[[autodoc]] TFXLMWithLMHeadModel - - call - -## TFXLMForSequenceClassification - -[[autodoc]] TFXLMForSequenceClassification - - call - -## TFXLMForMultipleChoice - -[[autodoc]] TFXLMForMultipleChoice - - call - -## TFXLMForTokenClassification - -[[autodoc]] TFXLMForTokenClassification - - call - -## TFXLMForQuestionAnsweringSimple - -[[autodoc]] TFXLMForQuestionAnsweringSimple - - call diff --git a/docs/source/en/model_doc/xlnet.md b/docs/source/en/model_doc/xlnet.md new file mode 100644 index 000000000000..3685728cd72e --- /dev/null +++ b/docs/source/en/model_doc/xlnet.md @@ -0,0 +1,176 @@ + + +# XLNet + +
+ +## Overview + +The XLNet model was proposed in [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang, Zihang Dai, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, +Quoc V. Le. XLnet is an extension of the Transformer-XL model pre-trained using an autoregressive method to learn +bidirectional contexts by maximizing the expected likelihood over all permutations of the input sequence factorization +order. + +The abstract from the paper is the following: + +*With the capability of modeling bidirectional contexts, denoising autoencoding based pretraining like BERT achieves +better performance than pretraining approaches based on autoregressive language modeling. However, relying on +corrupting the input with masks, BERT neglects dependency between the masked positions and suffers from a +pretrain-finetune discrepancy. In light of these pros and cons, we propose XLNet, a generalized autoregressive +pretraining method that (1) enables learning bidirectional contexts by maximizing the expected likelihood over all +permutations of the factorization order and (2) overcomes the limitations of BERT thanks to its autoregressive +formulation. Furthermore, XLNet integrates ideas from Transformer-XL, the state-of-the-art autoregressive model, into +pretraining. Empirically, under comparable experiment settings, XLNet outperforms BERT on 20 tasks, often by a large +margin, including question answering, natural language inference, sentiment analysis, and document ranking.* + +Tips: + +- The specific attention pattern can be controlled at training and test time using the `perm_mask` input. +- Due to the difficulty of training a fully auto-regressive model over various factorization order, XLNet is pretrained + using only a sub-set of the output tokens as target which are selected with the `target_mapping` input. +- To use XLNet for sequential decoding (i.e. not in fully bi-directional setting), use the `perm_mask` and + `target_mapping` inputs to control the attention span and outputs (see examples in + *examples/pytorch/text-generation/run_generation.py*) +- XLNet is one of the few models that has no sequence length limit. +- XLNet is not a traditional autoregressive model but uses a training strategy that builds on that. It permutes the tokens in the sentence, then allows the model to use the last n tokens to predict the token n+1. Since this is all done with a mask, the sentence is actually fed in the model in the right order, but instead of masking the first n tokens for n+1, XLNet uses a mask that hides the previous tokens in some given permutation of 1,…,sequence length. +- XLNet also uses the same recurrence mechanism as Transformer-XL to build long-term dependencies. + +This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/zihangdai/xlnet/). 
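+As a minimal sketch of the `perm_mask` and `target_mapping` inputs mentioned in the tips (assuming the `xlnet-base-cased` checkpoint), the snippet below predicts only the last token of the sequence while hiding it from every position:

```python
import torch
from transformers import XLNetLMHeadModel, XLNetTokenizer

# Assumption: the xlnet-base-cased checkpoint.
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased")

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0)

# perm_mask[b, i, j] = 1.0 means token i may not attend to token j in batch b.
perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
perm_mask[:, :, -1] = 1.0  # hide the last token from every position

# target_mapping selects the positions to predict (here: only the last token).
target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)
target_mapping[0, 0, -1] = 1.0

outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
next_token_logits = outputs.logits  # shape (1, 1, vocab_size): scores for the masked position
```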
+ +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Token classification task guide](../tasks/token_classification) +- [Question answering task guide](../tasks/question_answering) +- [Causal language modeling task guide](../tasks/language_modeling) +- [Multiple choice task guide](../tasks/multiple_choice) + +## XLNetConfig + +[[autodoc]] XLNetConfig + +## XLNetTokenizer + +[[autodoc]] XLNetTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## XLNetTokenizerFast + +[[autodoc]] XLNetTokenizerFast + +## XLNet specific outputs + +[[autodoc]] models.xlnet.modeling_xlnet.XLNetModelOutput + +[[autodoc]] models.xlnet.modeling_xlnet.XLNetLMHeadModelOutput + +[[autodoc]] models.xlnet.modeling_xlnet.XLNetForSequenceClassificationOutput + +[[autodoc]] models.xlnet.modeling_xlnet.XLNetForMultipleChoiceOutput + +[[autodoc]] models.xlnet.modeling_xlnet.XLNetForTokenClassificationOutput + +[[autodoc]] models.xlnet.modeling_xlnet.XLNetForQuestionAnsweringSimpleOutput + +[[autodoc]] models.xlnet.modeling_xlnet.XLNetForQuestionAnsweringOutput + +[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetModelOutput + +[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetLMHeadModelOutput + +[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForSequenceClassificationOutput + +[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForMultipleChoiceOutput + +[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForTokenClassificationOutput + +[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForQuestionAnsweringSimpleOutput + +## XLNetModel + +[[autodoc]] XLNetModel + - forward + +## XLNetLMHeadModel + +[[autodoc]] XLNetLMHeadModel + - forward + +## XLNetForSequenceClassification + +[[autodoc]] XLNetForSequenceClassification + - forward + +## XLNetForMultipleChoice + +[[autodoc]] XLNetForMultipleChoice + - forward + +## XLNetForTokenClassification + +[[autodoc]] XLNetForTokenClassification + - forward + +## XLNetForQuestionAnsweringSimple + +[[autodoc]] XLNetForQuestionAnsweringSimple + - forward + +## XLNetForQuestionAnswering + +[[autodoc]] XLNetForQuestionAnswering + - forward + +## TFXLNetModel + +[[autodoc]] TFXLNetModel + - call + +## TFXLNetLMHeadModel + +[[autodoc]] TFXLNetLMHeadModel + - call + +## TFXLNetForSequenceClassification + +[[autodoc]] TFXLNetForSequenceClassification + - call + +## TFLNetForMultipleChoice + +[[autodoc]] TFXLNetForMultipleChoice + - call + +## TFXLNetForTokenClassification + +[[autodoc]] TFXLNetForTokenClassification + - call + +## TFXLNetForQuestionAnsweringSimple + +[[autodoc]] TFXLNetForQuestionAnsweringSimple + - call diff --git a/docs/source/en/model_doc/xlnet.mdx b/docs/source/en/model_doc/xlnet.mdx deleted file mode 100644 index ca30574690c5..000000000000 --- a/docs/source/en/model_doc/xlnet.mdx +++ /dev/null @@ -1,154 +0,0 @@ - - -# XLNet - -## Overview - -The XLNet model was proposed in [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang, Zihang Dai, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, -Quoc V. Le. XLnet is an extension of the Transformer-XL model pre-trained using an autoregressive method to learn -bidirectional contexts by maximizing the expected likelihood over all permutations of the input sequence factorization -order. 
- -The abstract from the paper is the following: - -*With the capability of modeling bidirectional contexts, denoising autoencoding based pretraining like BERT achieves -better performance than pretraining approaches based on autoregressive language modeling. However, relying on -corrupting the input with masks, BERT neglects dependency between the masked positions and suffers from a -pretrain-finetune discrepancy. In light of these pros and cons, we propose XLNet, a generalized autoregressive -pretraining method that (1) enables learning bidirectional contexts by maximizing the expected likelihood over all -permutations of the factorization order and (2) overcomes the limitations of BERT thanks to its autoregressive -formulation. Furthermore, XLNet integrates ideas from Transformer-XL, the state-of-the-art autoregressive model, into -pretraining. Empirically, under comparable experiment settings, XLNet outperforms BERT on 20 tasks, often by a large -margin, including question answering, natural language inference, sentiment analysis, and document ranking.* - -Tips: - -- The specific attention pattern can be controlled at training and test time using the `perm_mask` input. -- Due to the difficulty of training a fully auto-regressive model over various factorization order, XLNet is pretrained - using only a sub-set of the output tokens as target which are selected with the `target_mapping` input. -- To use XLNet for sequential decoding (i.e. not in fully bi-directional setting), use the `perm_mask` and - `target_mapping` inputs to control the attention span and outputs (see examples in - *examples/pytorch/text-generation/run_generation.py*) -- XLNet is one of the few models that has no sequence length limit. - -This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/zihangdai/xlnet/). 
- - -## XLNetConfig - -[[autodoc]] XLNetConfig - -## XLNetTokenizer - -[[autodoc]] XLNetTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## XLNetTokenizerFast - -[[autodoc]] XLNetTokenizerFast - -## XLNet specific outputs - -[[autodoc]] models.xlnet.modeling_xlnet.XLNetModelOutput - -[[autodoc]] models.xlnet.modeling_xlnet.XLNetLMHeadModelOutput - -[[autodoc]] models.xlnet.modeling_xlnet.XLNetForSequenceClassificationOutput - -[[autodoc]] models.xlnet.modeling_xlnet.XLNetForMultipleChoiceOutput - -[[autodoc]] models.xlnet.modeling_xlnet.XLNetForTokenClassificationOutput - -[[autodoc]] models.xlnet.modeling_xlnet.XLNetForQuestionAnsweringSimpleOutput - -[[autodoc]] models.xlnet.modeling_xlnet.XLNetForQuestionAnsweringOutput - -[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetModelOutput - -[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetLMHeadModelOutput - -[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForSequenceClassificationOutput - -[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForMultipleChoiceOutput - -[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForTokenClassificationOutput - -[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForQuestionAnsweringSimpleOutput - -## XLNetModel - -[[autodoc]] XLNetModel - - forward - -## XLNetLMHeadModel - -[[autodoc]] XLNetLMHeadModel - - forward - -## XLNetForSequenceClassification - -[[autodoc]] XLNetForSequenceClassification - - forward - -## XLNetForMultipleChoice - -[[autodoc]] XLNetForMultipleChoice - - forward - -## XLNetForTokenClassification - -[[autodoc]] XLNetForTokenClassification - - forward - -## XLNetForQuestionAnsweringSimple - -[[autodoc]] XLNetForQuestionAnsweringSimple - - forward - -## XLNetForQuestionAnswering - -[[autodoc]] XLNetForQuestionAnswering - - forward - -## TFXLNetModel - -[[autodoc]] TFXLNetModel - - call - -## TFXLNetLMHeadModel - -[[autodoc]] TFXLNetLMHeadModel - - call - -## TFXLNetForSequenceClassification - -[[autodoc]] TFXLNetForSequenceClassification - - call - -## TFLNetForMultipleChoice - -[[autodoc]] TFXLNetForMultipleChoice - - call - -## TFXLNetForTokenClassification - -[[autodoc]] TFXLNetForTokenClassification - - call - -## TFXLNetForQuestionAnsweringSimple - -[[autodoc]] TFXLNetForQuestionAnsweringSimple - - call diff --git a/docs/source/en/model_doc/xls_r.md b/docs/source/en/model_doc/xls_r.md new file mode 100644 index 000000000000..8e22004244ca --- /dev/null +++ b/docs/source/en/model_doc/xls_r.md @@ -0,0 +1,47 @@ + + +# XLS-R + +## Overview + +The XLS-R model was proposed in [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman +Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. + +The abstract from the paper is the following: + +*This paper presents XLS-R, a large-scale model for cross-lingual speech representation learning based on wav2vec 2.0. +We train models with up to 2B parameters on nearly half a million hours of publicly available speech audio in 128 +languages, an order of magnitude more public data than the largest known prior work. Our evaluation covers a wide range +of tasks, domains, data regimes and languages, both high and low-resource. 
On the CoVoST-2 speech translation +benchmark, we improve the previous state of the art by an average of 7.4 BLEU over 21 translation directions into +English. For speech recognition, XLS-R improves over the best known prior work on BABEL, MLS, CommonVoice as well as +VoxPopuli, lowering error rates by 14-34% relative on average. XLS-R also sets a new state of the art on VoxLingua107 +language identification. Moreover, we show that with sufficient model size, cross-lingual pretraining can outperform +English-only pretraining when translating English speech into other languages, a setting which favors monolingual +pretraining. We hope XLS-R can help to improve speech processing tasks for many more languages of the world.* + +Tips: + +- XLS-R is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. +- XLS-R model was trained using connectionist temporal classification (CTC) so the model output has to be decoded using + [`Wav2Vec2CTCTokenizer`]. + +Relevant checkpoints can be found under https://huggingface.co/models?other=xls_r. + +XLS-R's architecture is based on the Wav2Vec2 model, so one can refer to [Wav2Vec2's documentation page](wav2vec2). + +The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec). diff --git a/docs/source/en/model_doc/xls_r.mdx b/docs/source/en/model_doc/xls_r.mdx deleted file mode 100644 index 82a7e3b8afbd..000000000000 --- a/docs/source/en/model_doc/xls_r.mdx +++ /dev/null @@ -1,43 +0,0 @@ - - -# XLS-R - -## Overview - -The XLS-R model was proposed in [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman -Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. - -The abstract from the paper is the following: - -*This paper presents XLS-R, a large-scale model for cross-lingual speech representation learning based on wav2vec 2.0. -We train models with up to 2B parameters on nearly half a million hours of publicly available speech audio in 128 -languages, an order of magnitude more public data than the largest known prior work. Our evaluation covers a wide range -of tasks, domains, data regimes and languages, both high and low-resource. On the CoVoST-2 speech translation -benchmark, we improve the previous state of the art by an average of 7.4 BLEU over 21 translation directions into -English. For speech recognition, XLS-R improves over the best known prior work on BABEL, MLS, CommonVoice as well as -VoxPopuli, lowering error rates by 14-34% relative on average. XLS-R also sets a new state of the art on VoxLingua107 -language identification. Moreover, we show that with sufficient model size, cross-lingual pretraining can outperform -English-only pretraining when translating English speech into other languages, a setting which favors monolingual -pretraining. We hope XLS-R can help to improve speech processing tasks for many more languages of the world.* - -Tips: - -- XLS-R is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. -- XLS-R model was trained using connectionist temporal classification (CTC) so the model output has to be decoded using - [`Wav2Vec2CTCTokenizer`]. - -Relevant checkpoints can be found under https://huggingface.co/models?other=xls_r. 
- -XLS-R's architecture is based on the Wav2Vec2 model, so one can refer to [Wav2Vec2's documentation page](wav2vec2). - -The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec). diff --git a/docs/source/en/model_doc/xlsr_wav2vec2.md b/docs/source/en/model_doc/xlsr_wav2vec2.md new file mode 100644 index 000000000000..643d37416d38 --- /dev/null +++ b/docs/source/en/model_doc/xlsr_wav2vec2.md @@ -0,0 +1,45 @@ + + +# XLSR-Wav2Vec2 + +## Overview + +The XLSR-Wav2Vec2 model was proposed in [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael +Auli. + +The abstract from the paper is the following: + +*This paper presents XLSR which learns cross-lingual speech representations by pretraining a single model from the raw +waveform of speech in multiple languages. We build on wav2vec 2.0 which is trained by solving a contrastive task over +masked latent speech representations and jointly learns a quantization of the latents shared across languages. The +resulting model is fine-tuned on labeled data and experiments show that cross-lingual pretraining significantly +outperforms monolingual pretraining. On the CommonVoice benchmark, XLSR shows a relative phoneme error rate reduction +of 72% compared to the best known results. On BABEL, our approach improves word error rate by 16% relative compared to +a comparable system. Our approach enables a single multilingual speech recognition model which is competitive to strong +individual models. Analysis shows that the latent discrete speech representations are shared across languages with +increased sharing for related languages. We hope to catalyze research in low-resource speech understanding by releasing +XLSR-53, a large model pretrained in 53 languages.* + +Tips: + +- XLSR-Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. +- XLSR-Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be + decoded using [`Wav2Vec2CTCTokenizer`]. + +XLSR-Wav2Vec2's architecture is based on the Wav2Vec2 model, so one can refer to [Wav2Vec2's documentation page](wav2vec2). + +The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec). diff --git a/docs/source/en/model_doc/xlsr_wav2vec2.mdx b/docs/source/en/model_doc/xlsr_wav2vec2.mdx deleted file mode 100644 index 32229f28b147..000000000000 --- a/docs/source/en/model_doc/xlsr_wav2vec2.mdx +++ /dev/null @@ -1,41 +0,0 @@ - - -# XLSR-Wav2Vec2 - -## Overview - -The XLSR-Wav2Vec2 model was proposed in [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael -Auli. - -The abstract from the paper is the following: - -*This paper presents XLSR which learns cross-lingual speech representations by pretraining a single model from the raw -waveform of speech in multiple languages. We build on wav2vec 2.0 which is trained by solving a contrastive task over -masked latent speech representations and jointly learns a quantization of the latents shared across languages. The -resulting model is fine-tuned on labeled data and experiments show that cross-lingual pretraining significantly -outperforms monolingual pretraining. 
On the CommonVoice benchmark, XLSR shows a relative phoneme error rate reduction -of 72% compared to the best known results. On BABEL, our approach improves word error rate by 16% relative compared to -a comparable system. Our approach enables a single multilingual speech recognition model which is competitive to strong -individual models. Analysis shows that the latent discrete speech representations are shared across languages with -increased sharing for related languages. We hope to catalyze research in low-resource speech understanding by releasing -XLSR-53, a large model pretrained in 53 languages.* - -Tips: - -- XLSR-Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. -- XLSR-Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be - decoded using [`Wav2Vec2CTCTokenizer`]. - -XLSR-Wav2Vec2's architecture is based on the Wav2Vec2 model, so one can refer to [Wav2Vec2's documentation page](wav2vec2). - -The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec). diff --git a/docs/source/en/model_doc/xmod.md b/docs/source/en/model_doc/xmod.md new file mode 100644 index 000000000000..5a3409bbc4c3 --- /dev/null +++ b/docs/source/en/model_doc/xmod.md @@ -0,0 +1,131 @@ + + +# X-MOD + +## Overview + +The X-MOD model was proposed in [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, and Mikel Artetxe. +X-MOD extends multilingual masked language models like [XLM-R](xlm-roberta) to include language-specific modular components (_language adapters_) during pre-training. For fine-tuning, the language adapters in each transformer layer are frozen. + +The abstract from the paper is the following: + +*Multilingual pre-trained models are known to suffer from the curse of multilinguality, which causes per-language performance to drop as they cover more languages. We address this issue by introducing language-specific modules, which allows us to grow the total capacity of the model, while keeping the total number of trainable parameters per language constant. In contrast with prior work that learns language-specific components post-hoc, we pre-train the modules of our Cross-lingual Modular (X-MOD) models from the start. Our experiments on natural language inference, named entity recognition and question answering show that our approach not only mitigates the negative interference between languages, but also enables positive transfer, resulting in improved monolingual and cross-lingual performance. Furthermore, our approach enables adding languages post-hoc with no measurable drop in performance, no longer limiting the model usage to the set of pre-trained languages.* + +Tips: +- X-MOD is similar to [XLM-R](xlm-roberta), but a difference is that the input language needs to be specified so that the correct language adapter can be activated. +- The main models – base and large – have adapters for 81 languages. + +This model was contributed by [jvamvas](https://huggingface.co/jvamvas). +The original code can be found [here](https://github.com/facebookresearch/fairseq/tree/58cc6cca18f15e6d56e3f60c959fe4f878960a60/fairseq/models/xmod) and the original documentation is found [here](https://github.com/facebookresearch/fairseq/tree/58cc6cca18f15e6d56e3f60c959fe4f878960a60/examples/xmod). 
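+
+Since the correct language adapter has to be activated for each input, it can be useful to check which language codes a
+given checkpoint ships adapters for. A minimal sketch, assuming the checkpoint lists its adapter languages in the
+`languages` field of its configuration:
+
+```python
+from transformers import AutoConfig
+
+config = AutoConfig.from_pretrained("facebook/xmod-base")
+# Language codes with a pre-trained adapter, e.g. "en_XX", "de_DE", ...
+print(len(config.languages))
+print(list(config.languages)[:5])
+```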
+ +## Adapter Usage + +### Input language + +There are two ways to specify the input language: +1. By setting a default language before using the model: + +```python +from transformers import XmodModel + +model = XmodModel.from_pretrained("facebook/xmod-base") +model.set_default_language("en_XX") +``` + +2. By explicitly passing the index of the language adapter for each sample: + +```python +import torch + +input_ids = torch.tensor( + [ + [0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2], + [0, 1310, 49083, 443, 269, 71, 5486, 165, 60429, 660, 23, 2], + ] +) +lang_ids = torch.LongTensor( + [ + 0, # en_XX + 8, # de_DE + ] +) +output = model(input_ids, lang_ids=lang_ids) +``` + +### Fine-tuning +The paper recommends that the embedding layer and the language adapters are frozen during fine-tuning. A method for doing this is provided: + +```python +model.freeze_embeddings_and_language_adapters() +# Fine-tune the model ... +``` + +### Cross-lingual transfer +After fine-tuning, zero-shot cross-lingual transfer can be tested by activating the language adapter of the target language: + +```python +model.set_default_language("de_DE") +# Evaluate the model on German examples ... +``` + +## Resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Token classification task guide](../tasks/token_classification) +- [Question answering task guide](../tasks/question_answering) +- [Causal language modeling task guide](../tasks/language_modeling) +- [Masked language modeling task guide](../tasks/masked_language_modeling) +- [Multiple choice task guide](../tasks/multiple_choice) + +## XmodConfig + +[[autodoc]] XmodConfig + +## XmodModel + +[[autodoc]] XmodModel + - forward + +## XmodForCausalLM + +[[autodoc]] XmodForCausalLM + - forward + +## XmodForMaskedLM + +[[autodoc]] XmodForMaskedLM + - forward + +## XmodForSequenceClassification + +[[autodoc]] XmodForSequenceClassification + - forward + +## XmodForMultipleChoice + +[[autodoc]] XmodForMultipleChoice + - forward + +## XmodForTokenClassification + +[[autodoc]] XmodForTokenClassification + - forward + +## XmodForQuestionAnswering + +[[autodoc]] XmodForQuestionAnswering + - forward diff --git a/docs/source/en/model_doc/yolos.md b/docs/source/en/model_doc/yolos.md new file mode 100644 index 000000000000..6185c3a06757 --- /dev/null +++ b/docs/source/en/model_doc/yolos.md @@ -0,0 +1,78 @@ + + +# YOLOS + +## Overview + +The YOLOS model was proposed in [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu. +YOLOS proposes to just leverage the plain [Vision Transformer (ViT)](vit) for object detection, inspired by DETR. It turns out that a base-sized encoder-only Transformer can also achieve 42 AP on COCO, similar to DETR and much more complex frameworks such as Faster R-CNN. + +The abstract from the paper is the following: + +*Can Transformer perform 2D object- and region-level recognition from a pure sequence-to-sequence perspective with minimal knowledge about the 2D spatial structure? To answer this question, we present You Only Look at One Sequence (YOLOS), a series of object detection models based on the vanilla Vision Transformer with the fewest possible modifications, region priors, as well as inductive biases of the target task. 
We find that YOLOS pre-trained on the mid-sized ImageNet-1k dataset only can already achieve quite competitive performance on the challenging COCO object detection benchmark, e.g., YOLOS-Base directly adopted from BERT-Base architecture can obtain 42.0 box AP on COCO val. We also discuss the impacts as well as limitations of current pre-train schemes and model scaling strategies for Transformer in vision through YOLOS.* + +Tips: + +- One can use [`YolosImageProcessor`] for preparing images (and optional targets) for the model. Contrary to [DETR](detr), YOLOS doesn't require a `pixel_mask` to be created. + + + + YOLOS architecture. Taken from the original paper. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/hustvl/YOLOS). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with YOLOS. + + + +- All example notebooks illustrating inference + fine-tuning [`YolosForObjectDetection`] on a custom dataset can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/YOLOS). +- See also: [Object detection task guide](../tasks/object_detection) + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## YolosConfig + +[[autodoc]] YolosConfig + +## YolosImageProcessor + +[[autodoc]] YolosImageProcessor + - preprocess + - pad + - post_process_object_detection + +## YolosFeatureExtractor + +[[autodoc]] YolosFeatureExtractor + - __call__ + - pad + - post_process_object_detection + + +## YolosModel + +[[autodoc]] YolosModel + - forward + + +## YolosForObjectDetection + +[[autodoc]] YolosForObjectDetection + - forward diff --git a/docs/source/en/model_doc/yolos.mdx b/docs/source/en/model_doc/yolos.mdx deleted file mode 100644 index 838517ea765e..000000000000 --- a/docs/source/en/model_doc/yolos.mdx +++ /dev/null @@ -1,64 +0,0 @@ - - -# YOLOS - -## Overview - -The YOLOS model was proposed in [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu. -YOLOS proposes to just leverage the plain [Vision Transformer (ViT)](vit) for object detection, inspired by DETR. It turns out that a base-sized encoder-only Transformer can also achieve 42 AP on COCO, similar to DETR and much more complex frameworks such as Faster R-CNN. - -The abstract from the paper is the following: - -*Can Transformer perform 2D object- and region-level recognition from a pure sequence-to-sequence perspective with minimal knowledge about the 2D spatial structure? To answer this question, we present You Only Look at One Sequence (YOLOS), a series of object detection models based on the vanilla Vision Transformer with the fewest possible modifications, region priors, as well as inductive biases of the target task. We find that YOLOS pre-trained on the mid-sized ImageNet-1k dataset only can already achieve quite competitive performance on the challenging COCO object detection benchmark, e.g., YOLOS-Base directly adopted from BERT-Base architecture can obtain 42.0 box AP on COCO val. 
We also discuss the impacts as well as limitations of current pre-train schemes and model scaling strategies for Transformer in vision through YOLOS.* - -Tips: - -- One can use [`YolosImageProcessor`] for preparing images (and optional targets) for the model. Contrary to [DETR](detr), YOLOS doesn't require a `pixel_mask` to be created. -- Demo notebooks (regarding inference and fine-tuning on custom data) can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/YOLOS). - - - - YOLOS architecture. Taken from the original paper. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/hustvl/YOLOS). - -## YolosConfig - -[[autodoc]] YolosConfig - -## YolosImageProcessor - -[[autodoc]] YolosImageProcessor - - preprocess - - pad - - post_process_object_detection - -## YolosFeatureExtractor - -[[autodoc]] YolosFeatureExtractor - - __call__ - - pad - - post_process_object_detection - - -## YolosModel - -[[autodoc]] YolosModel - - forward - - -## YolosForObjectDetection - -[[autodoc]] YolosForObjectDetection - - forward diff --git a/docs/source/en/model_doc/yoso.md b/docs/source/en/model_doc/yoso.md new file mode 100644 index 000000000000..4b98cd348c9a --- /dev/null +++ b/docs/source/en/model_doc/yoso.md @@ -0,0 +1,102 @@ + + +# YOSO + +## Overview + +The YOSO model was proposed in [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) +by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. YOSO approximates standard softmax self-attention +via a Bernoulli sampling scheme based on Locality Sensitive Hashing (LSH). In principle, all the Bernoulli random variables can be sampled with +a single hash. + +The abstract from the paper is the following: + +*Transformer-based models are widely used in natural language processing (NLP). Central to the transformer model is +the self-attention mechanism, which captures the interactions of token pairs in the input sequences and depends quadratically +on the sequence length. Training such models on longer sequences is expensive. In this paper, we show that a Bernoulli sampling +attention mechanism based on Locality Sensitive Hashing (LSH), decreases the quadratic complexity of such models to linear. +We bypass the quadratic cost by considering self-attention as a sum of individual tokens associated with Bernoulli random +variables that can, in principle, be sampled at once by a single hash (although in practice, this number may be a small constant). +This leads to an efficient sampling scheme to estimate self-attention which relies on specific modifications of +LSH (to enable deployment on GPU architectures). We evaluate our algorithm on the GLUE benchmark with standard 512 sequence +length where we see favorable performance relative to a standard pretrained Transformer. On the Long Range Arena (LRA) benchmark, +for evaluating performance on long sequences, our method achieves results consistent with softmax self-attention but with sizable +speed-ups and memory savings and often outperforms other efficient self-attention methods. Our code is available at this https URL* + +Tips: + +- The YOSO attention algorithm is implemented through custom CUDA kernels, functions written in CUDA C++ that can be executed multiple times +in parallel on a GPU. 
+- The kernels provide a `fast_hash` function, which approximates the random projections of the queries and keys using the Fast Hadamard Transform. Using these +hash codes, the `lsh_cumulation` function approximates self-attention via LSH-based Bernoulli sampling. +- To use the custom kernels, the user should set `config.use_expectation = False`. To ensure that the kernels are compiled successfully, +the user must install the correct version of PyTorch and cudatoolkit. By default, `config.use_expectation = True`, which uses YOSO-E and +does not require compiling CUDA kernels. + + + + YOSO Attention Algorithm. Taken from the original paper. + +This model was contributed by [novice03](https://huggingface.co/novice03). The original code can be found [here](https://github.com/mlpen/YOSO). + +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Token classification task guide](../tasks/token_classification) +- [Question answering task guide](../tasks/question_answering) +- [Masked language modeling task guide](../tasks/masked_language_modeling) +- [Multiple choice task guide](../tasks/multiple_choice) + +## YosoConfig + +[[autodoc]] YosoConfig + + +## YosoModel + +[[autodoc]] YosoModel + - forward + + +## YosoForMaskedLM + +[[autodoc]] YosoForMaskedLM + - forward + + +## YosoForSequenceClassification + +[[autodoc]] YosoForSequenceClassification + - forward + +## YosoForMultipleChoice + +[[autodoc]] YosoForMultipleChoice + - forward + + +## YosoForTokenClassification + +[[autodoc]] YosoForTokenClassification + - forward + + +## YosoForQuestionAnswering + +[[autodoc]] YosoForQuestionAnswering + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/yoso.mdx b/docs/source/en/model_doc/yoso.mdx deleted file mode 100644 index 997ab4d09416..000000000000 --- a/docs/source/en/model_doc/yoso.mdx +++ /dev/null @@ -1,91 +0,0 @@ - - -# YOSO - -## Overview - -The YOSO model was proposed in [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) -by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. YOSO approximates standard softmax self-attention -via a Bernoulli sampling scheme based on Locality Sensitive Hashing (LSH). In principle, all the Bernoulli random variables can be sampled with -a single hash. - -The abstract from the paper is the following: - -*Transformer-based models are widely used in natural language processing (NLP). Central to the transformer model is -the self-attention mechanism, which captures the interactions of token pairs in the input sequences and depends quadratically -on the sequence length. Training such models on longer sequences is expensive. In this paper, we show that a Bernoulli sampling -attention mechanism based on Locality Sensitive Hashing (LSH), decreases the quadratic complexity of such models to linear. -We bypass the quadratic cost by considering self-attention as a sum of individual tokens associated with Bernoulli random -variables that can, in principle, be sampled at once by a single hash (although in practice, this number may be a small constant). -This leads to an efficient sampling scheme to estimate self-attention which relies on specific modifications of -LSH (to enable deployment on GPU architectures). We evaluate our algorithm on the GLUE benchmark with standard 512 sequence -length where we see favorable performance relative to a standard pretrained Transformer. 
On the Long Range Arena (LRA) benchmark, -for evaluating performance on long sequences, our method achieves results consistent with softmax self-attention but with sizable -speed-ups and memory savings and often outperforms other efficient self-attention methods. Our code is available at this https URL* - -Tips: - -- The YOSO attention algorithm is implemented through custom CUDA kernels, functions written in CUDA C++ that can be executed multiple times -in parallel on a GPU. -- The kernels provide a `fast_hash` function, which approximates the random projections of the queries and keys using the Fast Hadamard Transform. Using these -hash codes, the `lsh_cumulation` function approximates self-attention via LSH-based Bernoulli sampling. -- To use the custom kernels, the user should set `config.use_expectation = False`. To ensure that the kernels are compiled successfully, -the user must install the correct version of PyTorch and cudatoolkit. By default, `config.use_expectation = True`, which uses YOSO-E and -does not require compiling CUDA kernels. - - - - YOSO Attention Algorithm. Taken from the original paper. - -This model was contributed by [novice03](https://huggingface.co/novice03). The original code can be found [here](https://github.com/mlpen/YOSO). - - -## YosoConfig - -[[autodoc]] YosoConfig - - -## YosoModel - -[[autodoc]] YosoModel - - forward - - -## YosoForMaskedLM - -[[autodoc]] YosoForMaskedLM - - forward - - -## YosoForSequenceClassification - -[[autodoc]] YosoForSequenceClassification - - forward - -## YosoForMultipleChoice - -[[autodoc]] YosoForMultipleChoice - - forward - - -## YosoForTokenClassification - -[[autodoc]] YosoForTokenClassification - - forward - - -## YosoForQuestionAnswering - -[[autodoc]] YosoForQuestionAnswering - - forward \ No newline at end of file diff --git a/docs/source/en/model_memory_anatomy.md b/docs/source/en/model_memory_anatomy.md new file mode 100644 index 000000000000..0a0d5bb5b8bf --- /dev/null +++ b/docs/source/en/model_memory_anatomy.md @@ -0,0 +1,272 @@ + + +# Model training anatomy + +To understand performance optimization techniques that one can apply to improve efficiency of model training +speed and memory utilization, it's helpful to get familiar with how GPU is utilized during training, and how compute +intensity varies depending on an operation performed. + +Let's start by exploring a motivating example of GPU utilization and the training run of a model. For the demonstration, +we'll need to install a few libraries: + +```bash +pip install transformers datasets accelerate nvidia-ml-py3 +``` + +The `nvidia-ml-py3` library allows us to monitor the memory usage of the models from within Python. You might be familiar +with the `nvidia-smi` command in the terminal - this library allows to access the same information in Python directly. + +Then, we create some dummy data: random token IDs between 100 and 30000 and binary labels for a classifier. +In total, we get 512 sequences each with length 512 and store them in a [`~datasets.Dataset`] with PyTorch format. + + +```py +>>> import numpy as np +>>> from datasets import Dataset + + +>>> seq_len, dataset_size = 512, 512 +>>> dummy_data = { +... "input_ids": np.random.randint(100, 30000, (dataset_size, seq_len)), +... "labels": np.random.randint(0, 1, (dataset_size)), +... 
} +>>> ds = Dataset.from_dict(dummy_data) +>>> ds.set_format("pt") +``` + +To print summary statistics for the GPU utilization and the training run with the [`Trainer`] we define two helper functions: + +```py +>>> from pynvml import * + + +>>> def print_gpu_utilization(): +... nvmlInit() +... handle = nvmlDeviceGetHandleByIndex(0) +... info = nvmlDeviceGetMemoryInfo(handle) +... print(f"GPU memory occupied: {info.used//1024**2} MB.") + + +>>> def print_summary(result): +... print(f"Time: {result.metrics['train_runtime']:.2f}") +... print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}") +... print_gpu_utilization() +``` + +Let's verify that we start with a free GPU memory: + +```py +>>> print_gpu_utilization() +GPU memory occupied: 0 MB. +``` + +That looks good: the GPU memory is not occupied as we would expect before we load any models. If that's not the case on +your machine make sure to stop all processes that are using GPU memory. However, not all free GPU memory can be used by +the user. When a model is loaded to the GPU the kernels are also loaded, which can take up 1-2GB of memory. To see how +much it is we load a tiny tensor into the GPU which triggers the kernels to be loaded as well. + +```py +>>> import torch + + +>>> torch.ones((1, 1)).to("cuda") +>>> print_gpu_utilization() +GPU memory occupied: 1343 MB. +``` + +We see that the kernels alone take up 1.3GB of GPU memory. Now let's see how much space the model uses. + +## Load Model + +First, we load the `bert-large-uncased` model. We load the model weights directly to the GPU so that we can check +how much space just the weights use. + + +```py +>>> from transformers import AutoModelForSequenceClassification + + +>>> model = AutoModelForSequenceClassification.from_pretrained("bert-large-uncased").to("cuda") +>>> print_gpu_utilization() +GPU memory occupied: 2631 MB. +``` + +We can see that the model weights alone take up 1.3 GB of GPU memory. The exact number depends on the specific +GPU you are using. Note that on newer GPUs a model can sometimes take up more space since the weights are loaded in an +optimized fashion that speeds up the usage of the model. Now we can also quickly check if we get the same result +as with `nvidia-smi` CLI: + + +```bash +nvidia-smi +``` + +```bash +Tue Jan 11 08:58:05 2022 ++-----------------------------------------------------------------------------+ +| NVIDIA-SMI 460.91.03 Driver Version: 460.91.03 CUDA Version: 11.2 | +|-------------------------------+----------------------+----------------------+ +| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|===============================+======================+======================| +| 0 Tesla V100-SXM2... On | 00000000:00:04.0 Off | 0 | +| N/A 37C P0 39W / 300W | 2631MiB / 16160MiB | 0% Default | +| | | N/A | ++-------------------------------+----------------------+----------------------+ + ++-----------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=============================================================================| +| 0 N/A N/A 3721 C ...nvs/codeparrot/bin/python 2629MiB | ++-----------------------------------------------------------------------------+ +``` + +We get the same number as before and you can also see that we are using a V100 GPU with 16GB of memory. 
So now we can +start training the model and see how the GPU memory consumption changes. First, we set up a few standard training +arguments: + +```py +default_args = { + "output_dir": "tmp", + "evaluation_strategy": "steps", + "num_train_epochs": 1, + "log_level": "error", + "report_to": "none", +} +``` + + + + If you plan to run multiple experiments, in order to properly clear the memory between experiments, restart the Python + kernel between experiments. + + + +## Memory utilization at vanilla training + +Let's use the [`Trainer`] and train the model without using any GPU performance optimization techniques and a batch size of 4: + +```py +>>> from transformers import TrainingArguments, Trainer, logging + +>>> logging.set_verbosity_error() + + +>>> training_args = TrainingArguments(per_device_train_batch_size=4, **default_args) +>>> trainer = Trainer(model=model, args=training_args, train_dataset=ds) +>>> result = trainer.train() +>>> print_summary(result) +``` + +``` +Time: 57.82 +Samples/second: 8.86 +GPU memory occupied: 14949 MB. +``` + +We see that already a relatively small batch size almost fills up our GPU's entire memory. However, a larger batch size +can often result in faster model convergence or better end performance. So ideally we want to tune the batch size to our +model's needs and not to the GPU limitations. What's interesting is that we use much more memory than the size of the model. +To understand a bit better why this is the case let's have a look at a model's operations and memory needs. + +## Anatomy of Model's Operations + +Transformers architecture includes 3 main groups of operations grouped below by compute-intensity. + +1. **Tensor Contractions** + + Linear layers and components of Multi-Head Attention all do batched **matrix-matrix multiplications**. These operations are the most compute-intensive part of training a transformer. + +2. **Statistical Normalizations** + + Softmax and layer normalization are less compute-intensive than tensor contractions, and involve one or more **reduction operations**, the result of which is then applied via a map. + +3. **Element-wise Operators** + + These are the remaining operators: **biases, dropout, activations, and residual connections**. These are the least compute-intensive operations. + +This knowledge can be helpful to know when analyzing performance bottlenecks. + +This summary is derived from [Data Movement Is All You Need: A Case Study on Optimizing Transformers 2020](https://arxiv.org/abs/2007.00072) + + +## Anatomy of Model's Memory + +We've seen that training the model uses much more memory than just putting the model on the GPU. This is because there +are many components during training that use GPU memory. The components on GPU memory are the following: + +1. model weights +2. optimizer states +3. gradients +4. forward activations saved for gradient computation +5. temporary buffers +6. functionality-specific memory + +A typical model trained in mixed precision with AdamW requires 18 bytes per model parameter plus activation memory. For +inference there are no optimizer states and gradients, so we can subtract those. And thus we end up with 6 bytes per +model parameter for mixed precision inference, plus activation memory. + +Let's look at the details. 
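+
+As a rough sanity check before breaking these numbers down, the short sketch below applies the per-parameter estimates
+above to the `bert-large-uncased` model loaded earlier (the `estimate_gb` helper is only illustrative, not part of the
+library). The estimate covers weights, optimizer states and gradients only; the memory measured during training above is
+higher because activations, temporary buffers and the CUDA kernels come on top.
+
+```py
+# Back-of-the-envelope estimate using the byte counts discussed above.
+# `model` is the AutoModelForSequenceClassification instance loaded earlier.
+num_params = sum(p.numel() for p in model.parameters())
+
+
+def estimate_gb(bytes_per_param):
+    # weights + optimizer states + gradients only; activations come on top
+    return num_params * bytes_per_param / 1024**3
+
+
+print(f"Parameters: {num_params / 1e6:.0f}M")
+print(f"Mixed precision training with AdamW: ~{estimate_gb(18):.1f} GB + activations")
+print(f"Mixed precision inference: ~{estimate_gb(6):.1f} GB + activations")
+```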
+
+**Model Weights:**
+
+- 4 bytes * number of parameters for fp32 training
+- 6 bytes * number of parameters for mixed precision training (maintains a model in fp32 and one in fp16 in memory)
+
+**Optimizer States:**
+
+- 8 bytes * number of parameters for normal AdamW (maintains 2 states)
+- 2 bytes * number of parameters for 8-bit AdamW optimizers like [bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
+- 4 bytes * number of parameters for optimizers like SGD with momentum (maintains only 1 state)
+
+**Gradients:**
+
+- 4 bytes * number of parameters for either fp32 or mixed precision training (gradients are always kept in fp32)
+
+**Forward Activations:**
+
+- size depends on many factors, the key ones being sequence length, hidden size and batch size.
+
+This includes the inputs and outputs passed to and returned by the forward and backward functions, as well as the
+forward activations saved for gradient computation.
+
+**Temporary Memory**
+
+Additionally, there are all kinds of temporary variables that are released once the calculation is done, but which can
+require additional memory in the meantime and push the run out of memory (OOM). Therefore, when coding it's crucial to think
+strategically about such temporary variables and sometimes to explicitly free them as soon as they are no longer needed.
+
+**Functionality-specific memory**
+
+Your software may also have special memory needs. For example, when generating text using beam search, the software
+needs to maintain multiple copies of inputs and outputs.
+
+**`forward` vs `backward` Execution Speed**
+
+For convolutions and linear layers there are 2x the flops in the backward pass compared to the forward pass, which generally
+translates into a backward pass that is roughly 2x slower (sometimes more, because sizes in the backward tend to be more awkward).
+Activations are usually bandwidth-limited, and it’s typical for an activation to have to read more data in the backward pass than
+in the forward pass (e.g. the activation forward reads once and writes once, while the activation backward reads twice, gradOutput
+and the output of the forward, and writes once, gradInput).
+
+As you can see, there are potentially a few places where we could save GPU memory or speed up operations.
+Now that you understand what affects GPU utilization and computation speed, refer to
+the [Methods and tools for efficient training on a single GPU](perf_train_gpu_one) documentation page to learn about
+performance optimization techniques. diff --git a/docs/source/en/model_sharing.md b/docs/source/en/model_sharing.md new file mode 100644 index 000000000000..84d287570da1 --- /dev/null +++ b/docs/source/en/model_sharing.md @@ -0,0 +1,232 @@
+
+
+# Share a model
+
+The last two tutorials showed how you can fine-tune a model with PyTorch, Keras, and 🤗 Accelerate for distributed setups. The next step is to share your model with the community! At Hugging Face, we believe in openly sharing knowledge and resources to democratize artificial intelligence for everyone. We encourage you to consider sharing your model with the community to help others save time and resources.
+
+In this tutorial, you will learn two methods for sharing a trained or fine-tuned model on the [Model Hub](https://huggingface.co/models):
+
+- Programmatically push your files to the Hub.
+- Drag-and-drop your files to the Hub with the web interface.
+
+
+
+
+To share a model with the community, you need an account on [huggingface.co](https://huggingface.co/join). You can also join an existing organization or create a new one.
+ + + +## Repository features + +Each repository on the Model Hub behaves like a typical GitHub repository. Our repositories offer versioning, commit history, and the ability to visualize differences. + +The Model Hub's built-in versioning is based on git and [git-lfs](https://git-lfs.github.com/). In other words, you can treat one model as one repository, enabling greater access control and scalability. Version control allows *revisions*, a method for pinning a specific version of a model with a commit hash, tag or branch. + +As a result, you can load a specific model version with the `revision` parameter: + +```py +>>> model = AutoModel.from_pretrained( +... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash +... ) +``` + +Files are also easily edited in a repository, and you can view the commit history as well as the difference: + +![vis_diff](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vis_diff.png) + +## Setup + +Before sharing a model to the Hub, you will need your Hugging Face credentials. If you have access to a terminal, run the following command in the virtual environment where 🤗 Transformers is installed. This will store your access token in your Hugging Face cache folder (`~/.cache/` by default): + +```bash +huggingface-cli login +``` + +If you are using a notebook like Jupyter or Colaboratory, make sure you have the [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) library installed. This library allows you to programmatically interact with the Hub. + +```bash +pip install huggingface_hub +``` + +Then use `notebook_login` to sign-in to the Hub, and follow the link [here](https://huggingface.co/settings/token) to generate a token to login with: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Convert a model for all frameworks + +To ensure your model can be used by someone working with a different framework, we recommend you convert and upload your model with both PyTorch and TensorFlow checkpoints. While users are still able to load your model from a different framework if you skip this step, it will be slower because 🤗 Transformers will need to convert the checkpoint on-the-fly. + +Converting a checkpoint for another framework is easy. Make sure you have PyTorch and TensorFlow installed (see [here](installation) for installation instructions), and then find the specific model for your task in the other framework. + + + +Specify `from_tf=True` to convert a checkpoint from TensorFlow to PyTorch: + +```py +>>> pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True) +>>> pt_model.save_pretrained("path/to/awesome-name-you-picked") +``` + + +Specify `from_pt=True` to convert a checkpoint from PyTorch to TensorFlow: + +```py +>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True) +``` + +Then you can save your new TensorFlow model with its new checkpoint: + +```py +>>> tf_model.save_pretrained("path/to/awesome-name-you-picked") +``` + + +If a model is available in Flax, you can also convert a checkpoint from PyTorch to Flax: + +```py +>>> flax_model = FlaxDistilBertForSequenceClassification.from_pretrained( +... "path/to/awesome-name-you-picked", from_pt=True +... ) +``` + + + +## Push a model during training + + + + + +Sharing a model to the Hub is as simple as adding an extra parameter or callback. 
Remember from the [fine-tuning tutorial](training), the [`TrainingArguments`] class is where you specify hyperparameters and additional training options. One of these training options includes the ability to push a model directly to the Hub. Set `push_to_hub=True` in your [`TrainingArguments`]: + +```py +>>> training_args = TrainingArguments(output_dir="my-awesome-model", push_to_hub=True) +``` + +Pass your training arguments as usual to [`Trainer`]: + +```py +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=small_train_dataset, +... eval_dataset=small_eval_dataset, +... compute_metrics=compute_metrics, +... ) +``` + +After you fine-tune your model, call [`~transformers.Trainer.push_to_hub`] on [`Trainer`] to push the trained model to the Hub. 🤗 Transformers will even automatically add training hyperparameters, training results and framework versions to your model card! + +```py +>>> trainer.push_to_hub() +``` + + +Share a model to the Hub with [`PushToHubCallback`]. In the [`PushToHubCallback`] function, add: + +- An output directory for your model. +- A tokenizer. +- The `hub_model_id`, which is your Hub username and model name. + +```py +>>> from transformers import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="./your_model_save_path", tokenizer=tokenizer, hub_model_id="your-username/my-awesome-model" +... ) +``` + +Add the callback to [`fit`](https://keras.io/api/models/model_training_apis/), and 🤗 Transformers will push the trained model to the Hub: + +```py +>>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback) +``` + + + +## Use the `push_to_hub` function + +You can also call `push_to_hub` directly on your model to upload it to the Hub. + +Specify your model name in `push_to_hub`: + +```py +>>> pt_model.push_to_hub("my-awesome-model") +``` + +This creates a repository under your username with the model name `my-awesome-model`. Users can now load your model with the `from_pretrained` function: + +```py +>>> from transformers import AutoModel + +>>> model = AutoModel.from_pretrained("your_username/my-awesome-model") +``` + +If you belong to an organization and want to push your model under the organization name instead, just add it to the `repo_id`: + +```py +>>> pt_model.push_to_hub("my-awesome-org/my-awesome-model") +``` + +The `push_to_hub` function can also be used to add other files to a model repository. For example, add a tokenizer to a model repository: + +```py +>>> tokenizer.push_to_hub("my-awesome-model") +``` + +Or perhaps you'd like to add the TensorFlow version of your fine-tuned PyTorch model: + +```py +>>> tf_model.push_to_hub("my-awesome-model") +``` + +Now when you navigate to your Hugging Face profile, you should see your newly created model repository. Clicking on the **Files** tab will display all the files you've uploaded to the repository. + +For more details on how to create and upload files to a repository, refer to the Hub documentation [here](https://huggingface.co/docs/hub/how-to-upstream). + +## Upload with the web interface + +Users who prefer a no-code approach are able to upload a model through the Hub's web interface. Visit [huggingface.co/new](https://huggingface.co/new) to create a new repository: + +![new_model_repo](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/new_model_repo.png) + +From here, add some information about your model: + +- Select the **owner** of the repository. 
This can be yourself or any of the organizations you belong to. +- Pick a name for your model, which will also be the repository name. +- Choose whether your model is public or private. +- Specify the license usage for your model. + +Now click on the **Files** tab and click on the **Add file** button to upload a new file to your repository. Then drag-and-drop a file to upload and add a commit message. + +![upload_file](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/upload_file.png) + +## Add a model card + +To make sure users understand your model's capabilities, limitations, potential biases and ethical considerations, please add a model card to your repository. The model card is defined in the `README.md` file. You can add a model card by: + +* Manually creating and uploading a `README.md` file. +* Clicking on the **Edit model card** button in your model repository. + +Take a look at the DistilBert [model card](https://huggingface.co/distilbert-base-uncased) for a good example of the type of information a model card should include. For more details about other options you can control in the `README.md` file such as a model's carbon footprint or widget examples, refer to the documentation [here](https://huggingface.co/docs/hub/models-cards). diff --git a/docs/source/en/model_sharing.mdx b/docs/source/en/model_sharing.mdx deleted file mode 100644 index e6bd7fc4a6af..000000000000 --- a/docs/source/en/model_sharing.mdx +++ /dev/null @@ -1,228 +0,0 @@ - - -# Share a model - -The last two tutorials showed how you can fine-tune a model with PyTorch, Keras, and 🤗 Accelerate for distributed setups. The next step is to share your model with the community! At Hugging Face, we believe in openly sharing knowledge and resources to democratize artificial intelligence for everyone. We encourage you to consider sharing your model with the community to help others save time and resources. - -In this tutorial, you will learn two methods for sharing a trained or fine-tuned model on the [Model Hub](https://huggingface.co/models): - -- Programmatically push your files to the Hub. -- Drag-and-drop your files to the Hub with the web interface. - - - - - -To share a model with the community, you need an account on [huggingface.co](https://huggingface.co/join). You can also join an existing organization or create a new one. - - - -## Repository features - -Each repository on the Model Hub behaves like a typical GitHub repository. Our repositories offer versioning, commit history, and the ability to visualize differences. - -The Model Hub's built-in versioning is based on git and [git-lfs](https://git-lfs.github.com/). In other words, you can treat one model as one repository, enabling greater access control and scalability. Version control allows *revisions*, a method for pinning a specific version of a model with a commit hash, tag or branch. - -As a result, you can load a specific model version with the `revision` parameter: - -```py ->>> model = AutoModel.from_pretrained( -... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash -... ) -``` - -Files are also easily edited in a repository, and you can view the commit history as well as the difference: - -![vis_diff](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vis_diff.png) - -## Setup - -Before sharing a model to the Hub, you will need your Hugging Face credentials. 
If you have access to a terminal, run the following command in the virtual environment where 🤗 Transformers is installed. This will store your access token in your Hugging Face cache folder (`~/.cache/` by default): - -```bash -huggingface-cli login -``` - -If you are using a notebook like Jupyter or Colaboratory, make sure you have the [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) library installed. This library allows you to programmatically interact with the Hub. - -```bash -pip install huggingface_hub -``` - -Then use `notebook_login` to sign-in to the Hub, and follow the link [here](https://huggingface.co/settings/token) to generate a token to login with: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Convert a model for all frameworks - -To ensure your model can be used by someone working with a different framework, we recommend you convert and upload your model with both PyTorch and TensorFlow checkpoints. While users are still able to load your model from a different framework if you skip this step, it will be slower because 🤗 Transformers will need to convert the checkpoint on-the-fly. - -Converting a checkpoint for another framework is easy. Make sure you have PyTorch and TensorFlow installed (see [here](installation) for installation instructions), and then find the specific model for your task in the other framework. - - - -Specify `from_tf=True` to convert a checkpoint from TensorFlow to PyTorch: - -```py ->>> pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True) ->>> pt_model.save_pretrained("path/to/awesome-name-you-picked") -``` - - -Specify `from_pt=True` to convert a checkpoint from PyTorch to TensorFlow: - -```py ->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True) -``` - -Then you can save your new TensorFlow model with it's new checkpoint: - -```py ->>> tf_model.save_pretrained("path/to/awesome-name-you-picked") -``` - - -If a model is available in Flax, you can also convert a checkpoint from PyTorch to Flax: - -```py ->>> flax_model = FlaxDistilBertForSequenceClassification.from_pretrained( -... "path/to/awesome-name-you-picked", from_pt=True -... ) -``` - - - -## Push a model during training - - - - - -Sharing a model to the Hub is as simple as adding an extra parameter or callback. Remember from the [fine-tuning tutorial](training), the [`TrainingArguments`] class is where you specify hyperparameters and additional training options. One of these training options includes the ability to push a model directly to the Hub. Set `push_to_hub=True` in your [`TrainingArguments`]: - -```py ->>> training_args = TrainingArguments(output_dir="my-awesome-model", push_to_hub=True) -``` - -Pass your training arguments as usual to [`Trainer`]: - -```py ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=small_train_dataset, -... eval_dataset=small_eval_dataset, -... compute_metrics=compute_metrics, -... ) -``` - -After you fine-tune your model, call [`~transformers.Trainer.push_to_hub`] on [`Trainer`] to push the trained model to the Hub. 🤗 Transformers will even automatically add training hyperparameters, training results and framework versions to your model card! - -```py ->>> trainer.push_to_hub() -``` - - -Share a model to the Hub with [`PushToHubCallback`]. In the [`PushToHubCallback`] function, add: - -- An output directory for your model. 
-- A tokenizer. -- The `hub_model_id`, which is your Hub username and model name. - -```py ->>> from transformers.keras.callbacks import PushToHubCallback - ->>> push_to_hub_callback = PushToHubCallback( -... output_dir="./your_model_save_path", tokenizer=tokenizer, hub_model_id="your-username/my-awesome-model" -... ) -``` - -Add the callback to [`fit`](https://keras.io/api/models/model_training_apis/), and 🤗 Transformers will push the trained model to the Hub: - -```py ->>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback) -``` - - - -## Use the `push_to_hub` function - -You can also call `push_to_hub` directly on your model to upload it to the Hub. - -Specify your model name in `push_to_hub`: - -```py ->>> pt_model.push_to_hub("my-awesome-model") -``` - -This creates a repository under your username with the model name `my-awesome-model`. Users can now load your model with the `from_pretrained` function: - -```py ->>> from transformers import AutoModel - ->>> model = AutoModel.from_pretrained("your_username/my-awesome-model") -``` - -If you belong to an organization and want to push your model under the organization name instead, just add it to the `repo_id`: - -```py ->>> pt_model.push_to_hub("my-awesome-org/my-awesome-model") -``` - -The `push_to_hub` function can also be used to add other files to a model repository. For example, add a tokenizer to a model repository: - -```py ->>> tokenizer.push_to_hub("my-awesome-model") -``` - -Or perhaps you'd like to add the TensorFlow version of your fine-tuned PyTorch model: - -```py ->>> tf_model.push_to_hub("my-awesome-model") -``` - -Now when you navigate to the your Hugging Face profile, you should see your newly created model repository. Clicking on the **Files** tab will display all the files you've uploaded to the repository. - -For more details on how to create and upload files to a repository, refer to the Hub documentation [here](https://huggingface.co/docs/hub/how-to-upstream). - -## Upload with the web interface - -Users who prefer a no-code approach are able to upload a model through the Hub's web interface. Visit [huggingface.co/new](https://huggingface.co/new) to create a new repository: - -![new_model_repo](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/new_model_repo.png) - -From here, add some information about your model: - -- Select the **owner** of the repository. This can be yourself or any of the organizations you belong to. -- Pick a name for your model, which will also be the repository name. -- Choose whether your model is public or private. -- Specify the license usage for your model. - -Now click on the **Files** tab and click on the **Add file** button to upload a new file to your repository. Then drag-and-drop a file to upload and add a commit message. - -![upload_file](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/upload_file.png) - -## Add a model card - -To make sure users understand your model's capabilities, limitations, potential biases and ethical considerations, please add a model card to your repository. The model card is defined in the `README.md` file. You can add a model card by: - -* Manually creating and uploading a `README.md` file. -* Clicking on the **Edit model card** button in your model repository. - -Take a look at the DistilBert [model card](https://huggingface.co/distilbert-base-uncased) for a good example of the type of information a model card should include. 
For more details about other options you can control in the `README.md` file such as a model's carbon footprint or widget examples, refer to the documentation [here](https://huggingface.co/docs/hub/models-cards). diff --git a/docs/source/en/model_summary.md b/docs/source/en/model_summary.md new file mode 100644 index 000000000000..10acb4c50210 --- /dev/null +++ b/docs/source/en/model_summary.md @@ -0,0 +1,107 @@ + + +# The Transformer model family + +Since its introduction in 2017, the [original Transformer](https://arxiv.org/abs/1706.03762) model has inspired many new and exciting models that extend beyond natural language processing (NLP) tasks. There are models for [predicting the folded structure of proteins](https://huggingface.co/blog/deep-learning-with-proteins), [training a cheetah to run](https://huggingface.co/blog/train-decision-transformers), and [time series forecasting](https://huggingface.co/blog/time-series-transformers). With so many Transformer variants available, it can be easy to miss the bigger picture. What all these models have in common is they're based on the original Transformer architecture. Some models only use the encoder or decoder, while others use both. This provides a useful taxonomy to categorize and examine the high-level differences within models in the Transformer family, and it'll help you understand Transformers you haven't encountered before. + +If you aren't familiar with the original Transformer model or need a refresher, check out the [How do Transformers work](https://huggingface.co/course/chapter1/4?fw=pt) chapter from the Hugging Face course. + +
+ +
+ +## Computer vision + + + +### Convolutional network + +For a long time, convolutional networks (CNNs) were the dominant paradigm for computer vision tasks until the [Vision Transformer](https://arxiv.org/abs/2010.11929) demonstrated its scalability and efficiency. Even then, some of a CNN's best qualities, like translation invariance, are so powerful (especially for certain tasks) that some Transformers incorporate convolutions in their architecture. [ConvNeXt](model_doc/convnext) flipped this exchange around and incorporated design choices from Transformers to modernize a CNN. For example, ConvNeXt uses non-overlapping sliding windows to patchify an image and a larger kernel to increase its global receptive field. ConvNeXt also makes several layer design choices to be more memory-efficient and improve performance, so it competes favorably with Transformers! + +### Encoder[[cv-encoder]] + +The [Vision Transformer (ViT)](model_doc/vit) opened the door to computer vision tasks without convolutions. ViT uses a standard Transformer encoder, but its main breakthrough was how it treated an image. It splits an image into fixed-size patches and uses them to create an embedding, just like how a sentence is split into tokens. ViT capitalized on the Transformers' efficient architecture to demonstrate competitive results with the CNNs at the time while requiring fewer resources to train. ViT was soon followed by other vision models that could also handle dense vision tasks like segmentation as well as detection. + +One of these models is the [Swin](model_doc/swin) Transformer. It builds hierarchical feature maps (like a CNN 👀 and unlike ViT) from smaller-sized patches and merges them with neighboring patches in deeper layers. Attention is only computed within a local window, and the window is shifted between attention layers to create connections to help the model learn better. Since the Swin Transformer can produce hierarchical feature maps, it is a good candidate for dense prediction tasks like segmentation and detection. The [SegFormer](model_doc/segformer) also uses a Transformer encoder to build hierarchical feature maps, but it adds a simple multilayer perceptron (MLP) decoder on top to combine all the feature maps and make a prediction. + +Other vision models, like BeIT and ViTMAE, drew inspiration from BERT's pretraining objective. [BeIT](model_doc/beit) is pretrained by *masked image modeling (MIM)*; the image patches are randomly masked, and the image is also tokenized into visual tokens. BeIT is trained to predict the visual tokens corresponding to the masked patches. [ViTMAE](model_doc/vitmae) has a similar pretraining objective, except it must predict the pixels instead of visual tokens. What's unusual is 75% of the image patches are masked! The decoder reconstructs the pixels from the masked tokens and encoded patches. After pretraining, the decoder is thrown away, and the encoder is ready to be used in downstream tasks. + +### Decoder[[cv-decoder]] + +Decoder-only vision models are rare because most vision models rely on an encoder to learn an image representation. But for use cases like image generation, the decoder is a natural fit, as we've seen from text generation models like GPT-2. [ImageGPT](model_doc/imagegpt) uses the same architecture as GPT-2, but instead of predicting the next token in a sequence, it predicts the next pixel in an image. In addition to image generation, ImageGPT could also be finetuned for image classification. 
+ +### Encoder-decoder[[cv-encoder-decoder]] + +Vision models commonly use an encoder (also known as a backbone) to extract important image features before passing them to a Transformer decoder. [DETR](model_doc/detr) has a pretrained backbone, but it also uses the complete Transformer encoder-decoder architecture for object detection. The encoder learns image representations and combines them with object queries (each object query is a learned embedding that focuses on a region or object in an image) in the decoder. DETR predicts the bounding box coordinates and class label for each object query. + +## Natural language processing + + + +### Encoder[[nlp-encoder]] + +[BERT](model_doc/bert) is an encoder-only Transformer that randomly masks certain tokens in the input to avoid seeing other tokens, which would allow it to "cheat". The pretraining objective is to predict the masked token based on the context. This allows BERT to fully use the left and right contexts to help it learn a deeper and richer representation of the inputs. However, there was still room for improvement in BERT's pretraining strategy. [RoBERTa](model_doc/roberta) improved upon this by introducing a new pretraining recipe that includes training for longer and on larger batches, randomly masking tokens at each epoch instead of just once during preprocessing, and removing the next-sentence prediction objective. + +The dominant strategy to improve performance is to increase the model size. But training large models is computationally expensive. One way to reduce computational costs is using a smaller model like [DistilBERT](model_doc/distilbert). DistilBERT uses [knowledge distillation](https://arxiv.org/abs/1503.02531) - a compression technique - to create a smaller version of BERT while keeping nearly all of its language understanding capabilities. + +However, most Transformer models continued to trend towards more parameters, leading to new models focused on improving training efficiency. [ALBERT](model_doc/albert) reduces memory consumption by lowering the number of parameters in two ways: separating the larger vocabulary embedding into two smaller matrices and allowing layers to share parameters. [DeBERTa](model_doc/deberta) added a disentangled attention mechanism where the word and its position are separately encoded in two vectors. The attention is computed from these separate vectors instead of a single vector containing the word and position embeddings. [Longformer](model_doc/longformer) also focused on making attention more efficient, especially for processing documents with longer sequence lengths. It uses a combination of local windowed attention (attention only calculated from fixed window size around each token) and global attention (only for specific task tokens like `[CLS]` for classification) to create a sparse attention matrix instead of a full attention matrix. + +### Decoder[[nlp-decoder]] + +[GPT-2](model_doc/gpt2) is a decoder-only Transformer that predicts the next word in the sequence. It masks tokens to the right so the model can't "cheat" by looking ahead. By pretraining on a massive body of text, GPT-2 became really good at generating text, even if the text is only sometimes accurate or true. But GPT-2 lacked the bidirectional context from BERT's pretraining, which made it unsuitable for certain tasks. [XLNET](model_doc/xlnet) combines the best of both BERT and GPT-2's pretraining objectives by using a permutation language modeling objective (PLM) that allows it to learn bidirectionally. 
+ +After GPT-2, language models grew even bigger and are now known as *large language models (LLMs)*. LLMs demonstrate few- or even zero-shot learning if pretrained on a large enough dataset. [GPT-J](model_doc/gptj) is an LLM with 6B parameters trained on 400B tokens. GPT-J was followed by [OPT](model_doc/opt), a family of decoder-only models, the largest of which has 175B parameters and is trained on 180B tokens. [BLOOM](model_doc/bloom) was released around the same time, and the largest model in the family has 176B parameters and is trained on 366B tokens in 46 languages and 13 programming languages. + +### Encoder-decoder[[nlp-encoder-decoder]] + +[BART](model_doc/bart) keeps the original Transformer architecture, but it modifies the pretraining objective with *text infilling* corruption, where some text spans are replaced with a single `mask` token. The decoder predicts the uncorrupted tokens (future tokens are masked) and uses the encoder's hidden states to help it. [Pegasus](model_doc/pegasus) is similar to BART, but Pegasus masks entire sentences instead of text spans. In addition to masked language modeling, Pegasus is pretrained by gap sentence generation (GSG). The GSG objective masks whole sentences that are important to a document, replacing them with a `mask` token. The decoder must generate the output from the remaining sentences. [T5](model_doc/t5) takes a more unified approach and casts all NLP tasks into a text-to-text problem using specific prefixes. For example, the prefix `Summarize:` indicates a summarization task. T5 is pretrained with both supervised training (on the GLUE and SuperGLUE benchmarks) and self-supervised training (randomly sampling and dropping out 15% of tokens). + +## Audio + + + +### Encoder[[audio-encoder]] + +[Wav2Vec2](model_doc/wav2vec2) uses a Transformer encoder to learn speech representations directly from raw audio waveforms. It is pretrained with a contrastive task to distinguish the true speech representation from a set of false ones. [HuBERT](model_doc/hubert) is similar to Wav2Vec2 but has a different training process. Target labels are created by a clustering step in which segments of similar audio are assigned to a cluster, which becomes a hidden unit. The hidden unit is mapped to an embedding to make a prediction. + +### Encoder-decoder[[audio-encoder-decoder]] + +[Speech2Text](model_doc/speech_to_text) is a speech model designed for automatic speech recognition (ASR) and speech translation. The model accepts log mel-filter bank features extracted from the audio waveform and is pretrained autoregressively to generate a transcript or translation. [Whisper](model_doc/whisper) is also an ASR model, but unlike many other speech models, it is pretrained on a massive amount of ✨ labeled ✨ audio transcription data for zero-shot performance. A large chunk of the dataset also contains non-English languages, meaning Whisper can also be used for low-resource languages. Structurally, Whisper is similar to Speech2Text. The audio signal is converted to a log-mel spectrogram, which is encoded by the encoder. The decoder generates the transcript autoregressively from the encoder's hidden states and the previous tokens. + +## Multimodal + + + +### Encoder[[mm-encoder]] + +[VisualBERT](model_doc/visual_bert) is a multimodal model for vision-language tasks released shortly after BERT. It combines BERT and a pretrained object detection system to extract image features into visual embeddings, which are passed alongside the text embeddings to BERT. 
VisualBERT predicts the masked text based on the unmasked text and the visual embeddings, and it also has to predict whether the text is aligned with the image. When ViT was released, [ViLT](model_doc/vilt) adopted ViT in its architecture because it was easier to get the image embeddings this way. The image embeddings are jointly processed with the text embeddings. From there, ViLT is pretrained by image-text matching, masked language modeling, and whole word masking. + +[CLIP](model_doc/clip) takes a different approach and makes a prediction over (`image`, `text`) pairs. An image encoder (ViT) and a text encoder (Transformer) are jointly trained on a 400 million (`image`, `text`) pair dataset to maximize the similarity between the image and text embeddings of the (`image`, `text`) pairs. After pretraining, you can use natural language to instruct CLIP to predict the text given an image or vice versa. [OWL-ViT](model_doc/owlvit) builds on top of CLIP by using it as its backbone for zero-shot object detection. After pretraining, an object detection head is added to make a set prediction over the (`class`, `bounding box`) pairs. + +### Encoder-decoder[[mm-encoder-decoder]] + +Optical character recognition (OCR) is a long-standing text recognition task that typically involves several components to understand the image and generate the text. [TrOCR](model_doc/trocr) simplifies the process using an end-to-end Transformer. The encoder is a ViT-style model for image understanding and processes the image as fixed-size patches. The decoder accepts the encoder's hidden states and autoregressively generates text. [Donut](model_doc/donut) is a more general visual document understanding model that doesn't rely on OCR-based approaches. It uses a Swin Transformer as the encoder and multilingual BART as the decoder. Donut is pretrained to read text by predicting the next word based on the image and text annotations. The decoder generates a token sequence given a prompt. The prompt is represented by a special token for each downstream task. For example, document parsing has a special `parsing` token that is combined with the encoder hidden states to parse the document into a structured output format (JSON). + +## Reinforcement learning + + + +### Decoder[[rl-decoder]] + +The Decision Transformer and Trajectory Transformer cast the state, action, and reward as a sequence modeling problem. The [Decision Transformer](model_doc/decision_transformer) generates a series of actions that lead to a future desired return based on returns-to-go, past states, and actions. For the last *K* timesteps, each of the three modalities is converted into token embeddings and processed by a GPT-like model to predict a future action token. [Trajectory Transformer](model_doc/trajectory_transformer) also tokenizes the states, actions, and rewards and processes them with a GPT architecture. Unlike the Decision Transformer, which is focused on reward conditioning, the Trajectory Transformer generates future actions with beam search. \ No newline at end of file diff --git a/docs/source/en/model_summary.mdx b/docs/source/en/model_summary.mdx deleted file mode 100644 index b9799ab59129..000000000000 --- a/docs/source/en/model_summary.mdx +++ /dev/null @@ -1,955 +0,0 @@ - - -# Summary of the models - -This is a summary of the most downloaded models in 🤗 Transformers. Click on the large outermost bubble of each pretrained model category (encoder, decoder, encoder-decoder) to zoom in and out to see the most popular models within a modality. 
The size of each bubble corresponds to the number of downloads of each model. - - - -It assumes you're familiar with the original [transformer -model](https://arxiv.org/abs/1706.03762). For a gentle introduction check the [annotated transformer](http://nlp.seas.harvard.edu/2018/04/03/attention.html). Here we focus on the high-level differences between the -models. You can check them more in detail in their respective documentation. Also check out [the Model Hub](https://huggingface.co/models) where you can filter the checkpoints by model architecture. - -Each one of the models in the library falls into one of the following categories: - -- [autoregressive-models](#autoregressive-models) -- [autoencoding-models](#autoencoding-models) -- [seq-to-seq-models](#seq-to-seq-models) -- [multimodal-models](#multimodal-models) -- [retrieval-based-models](#retrieval-based-models) - - - -Autoregressive models are pretrained on the classic language modeling task: guess the next token having read all the -previous ones. They correspond to the decoder of the original transformer model, and a mask is used on top of the full -sentence so that the attention heads can only see what was before in the text, and not what’s after. Although those -models can be fine-tuned and achieve great results on many tasks, the most natural application is text generation. A -typical example of such models is GPT. - -Autoencoding models are pretrained by corrupting the input tokens in some way and trying to reconstruct the original -sentence. They correspond to the encoder of the original transformer model in the sense that they get access to the -full inputs without any mask. Those models usually build a bidirectional representation of the whole sentence. They can -be fine-tuned and achieve great results on many tasks such as text generation, but their most natural application is -sentence classification or token classification. A typical example of such models is BERT. - -Note that the only difference between autoregressive models and autoencoding models is in the way the model is -pretrained. Therefore, the same architecture can be used for both autoregressive and autoencoding models. When a given -model has been used for both types of pretraining, we have put it in the category corresponding to the article where it -was first introduced. - -Sequence-to-sequence models use both the encoder and the decoder of the original transformer, either for translation -tasks or by transforming other tasks to sequence-to-sequence problems. They can be fine-tuned to many tasks but their -most natural applications are translation, summarization and question answering. The original transformer model is an -example of such a model (only for translation), T5 is an example that can be fine-tuned on other tasks. - -Multimodal models mix text inputs with other kinds (e.g. images) and are more specific to a given task. - - - -## Decoders or autoregressive models - -As mentioned before, these models rely on the decoder part of the original transformer and use an attention mask so -that at each position, the model can only look at the tokens before the attention heads. - - - -### Original GPT - -
- -Models - - -Doc - - -Spaces - -
- - -[Improving Language Understanding by Generative Pre-Training](https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf), Alec Radford et al. - -The first autoregressive model based on the transformer architecture, pretrained on the Book Corpus dataset. - -The library provides versions of the model for language modeling and multitask language modeling/multiple choice -classification. - -### GPT-2 - -
- -Models - - -Doc - - -Spaces - -
- - -[Language Models are Unsupervised Multitask Learners](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf), -Alec Radford et al. - -A bigger and better version of GPT, pretrained on WebText (web pages from outgoing links in Reddit with 3 karmas or -more). - -The library provides versions of the model for language modeling and multitask language modeling/multiple choice -classification. - -### CTRL - -
- -Models - - -Doc - - -Spaces - -
- - -[CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858), -Nitish Shirish Keskar et al. - -Same as the GPT model but adds the idea of control codes. Text is generated from a prompt (can be empty) and one (or -several) of those control codes which are then used to influence the text generation: generate with the style of -wikipedia article, a book or a movie review. - -The library provides a version of the model for language modeling only. - -### Transformer-XL - -
- -Models - - -Doc - - -Spaces - -
- - -[Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860), Zihang -Dai et al. - -Same as a regular GPT model, but introduces a recurrence mechanism for two consecutive segments (similar to a regular -RNNs with two consecutive inputs). In this context, a segment is a number of consecutive tokens (for instance 512) that -may span across multiple documents, and segments are fed in order to the model. - -Basically, the hidden states of the previous segment are concatenated to the current input to compute the attention -scores. This allows the model to pay attention to information that was in the previous segment as well as the current -one. By stacking multiple attention layers, the receptive field can be increased to multiple previous segments. - -This changes the positional embeddings to positional relative embeddings (as the regular positional embeddings would -give the same results in the current input and the current hidden state at a given position) and needs to make some -adjustments in the way attention scores are computed. - -The library provides a version of the model for language modeling only. - - - -### Reformer - -
- -Models - - -Doc - - -Spaces - -
- - -[Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451), Nikita Kitaev et al . - -An autoregressive transformer model with lots of tricks to reduce memory footprint and compute time. Those tricks -include: - -- Use [Axial position encoding](#axial-pos-encoding) (see below for more details). It’s a mechanism to avoid - having a huge positional encoding matrix (when the sequence length is very big) by factorizing it into smaller - matrices. -- Replace traditional attention by [LSH (local-sensitive hashing) attention](#lsh-attention) (see below for more - details). It's a technique to avoid computing the full product query-key in the attention layers. -- Avoid storing the intermediate results of each layer by using reversible transformer layers to obtain them during - the backward pass (subtracting the residuals from the input of the next layer gives them back) or recomputing them - for results inside a given layer (less efficient than storing them but saves memory). -- Compute the feedforward operations by chunks and not on the whole batch. - -With those tricks, the model can be fed much larger sentences than traditional transformer autoregressive models. - - - -This model could be very well be used in an autoencoding setting, there is no checkpoint for such a -pretraining yet, though. - - - -The library provides a version of the model for language modeling only. - -### XLNet - -
- -Models - - -Doc - - -Spaces - -
- - -[XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237), Zhilin -Yang et al. - -XLNet is not a traditional autoregressive model but uses a training strategy that builds on that. It permutes the -tokens in the sentence, then allows the model to use the last n tokens to predict the token n+1. Since this is all done -with a mask, the sentence is actually fed in the model in the right order, but instead of masking the first n tokens -for n+1, XLNet uses a mask that hides the previous tokens in some given permutation of 1,...,sequence length. - -XLNet also uses the same recurrence mechanism as Transformer-XL to build long-term dependencies. - -The library provides a version of the model for language modeling, token classification, sentence classification, -multiple choice classification and question answering. - - - -## Encoders or autoencoding models - -As mentioned before, these models rely on the encoder part of the original transformer and use no mask so the model can -look at all the tokens in the attention heads. For pretraining, targets are the original sentences and inputs are their -corrupted versions. - - - -### BERT - -
- -Models - - -Doc - - -Spaces - -
- -[BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805), -Jacob Devlin et al. - -Corrupts the inputs by using random masking, more precisely, during pretraining, a given percentage of tokens (usually -15%) is masked by: - -- a special mask token with probability 0.8 -- a random token different from the one masked with probability 0.1 -- the same token with probability 0.1 - -The model must predict the original sentence, but has a second objective: inputs are two sentences A and B (with a -separation token in between). With probability 50%, the sentences are consecutive in the corpus, in the remaining 50% -they are not related. The model has to predict if the sentences are consecutive or not. - -The library provides a version of the model for language modeling (traditional or masked), next sentence prediction, -token classification, sentence classification, multiple choice classification and question answering. - -### ALBERT - -
- -Models - - -Doc - - -Spaces - -
- - -[ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), -Zhenzhong Lan et al. - -Same as BERT but with a few tweaks: - -- Embedding size E is different from hidden size H justified because the embeddings are context independent (one - embedding vector represents one token), whereas hidden states are context dependent (one hidden state represents a - sequence of tokens) so it's more logical to have H >> E. Also, the embedding matrix is large since it's V x E (V - being the vocab size). If E < H, it has less parameters. -- Layers are split in groups that share parameters (to save memory). -- Next sentence prediction is replaced by a sentence ordering prediction: in the inputs, we have two sentences A and - B (that are consecutive) and we either feed A followed by B or B followed by A. The model must predict if they have - been swapped or not. - -The library provides a version of the model for masked language modeling, token classification, sentence -classification, multiple choice classification and question answering. - -### RoBERTa - -
- -Models - - -Doc - - -Spaces - -
- - -[RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692), Yinhan Liu et al. - -Same as BERT with better pretraining tricks: - -- dynamic masking: tokens are masked differently at each epoch, whereas BERT does it once and for all -- no NSP (next sentence prediction) loss and instead of putting just two sentences together, put a chunk of - contiguous texts together to reach 512 tokens (so the sentences are in an order than may span several documents) -- train with larger batches -- use BPE with bytes as a subunit and not characters (because of unicode characters) - -The library provides a version of the model for masked language modeling, token classification, sentence -classification, multiple choice classification and question answering. - -### DistilBERT - -
- -Models - - -Doc - - -Spaces - -
- - -[DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108), -Victor Sanh et al. - -Same as BERT but smaller. Trained by distillation of the pretrained BERT model, meaning it's been trained to predict -the same probabilities as the larger model. The actual objective is a combination of: - -- finding the same probabilities as the teacher model -- predicting the masked tokens correctly (but no next-sentence objective) -- a cosine similarity between the hidden states of the student and the teacher model - -The library provides a version of the model for masked language modeling, token classification, sentence classification -and question answering. - -### ConvBERT - -
- -Models - - -Doc - - -Spaces - -
- - -[ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496), Zihang Jiang, -Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. - -Pre-trained language models like BERT and its variants have recently achieved impressive performance in various natural -language understanding tasks. However, BERT heavily relies on the global self-attention block and thus suffers large -memory footprint and computation cost. Although all its attention heads query on the whole input sequence for -generating the attention map from a global perspective, we observe some heads only need to learn local dependencies, -which means the existence of computation redundancy. We therefore propose a novel span-based dynamic convolution to -replace these self-attention heads to directly model local dependencies. The novel convolution heads, together with the -rest self-attention heads, form a new mixed attention block that is more efficient at both global and local context -learning. We equip BERT with this mixed attention design and build a ConvBERT model. Experiments have shown that -ConvBERT significantly outperforms BERT and its variants in various downstream tasks, with lower training cost and -fewer model parameters. Remarkably, ConvBERTbase model achieves 86.4 GLUE score, 0.7 higher than ELECTRAbase, while -using less than 1/4 training cost. - -The library provides a version of the model for masked language modeling, token classification, sentence classification -and question answering. - -### XLM - -
- -Models - - -Doc - - -Spaces - -
- - -[Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291), Guillaume Lample and Alexis Conneau - -A transformer model trained on several languages. There are three different type of training for this model and the -library provides checkpoints for all of them: - -- Causal language modeling (CLM) which is the traditional autoregressive training (so this model could be in the - previous section as well). One of the languages is selected for each training sample, and the model input is a - sentence of 256 tokens, that may span over several documents in one of those languages. -- Masked language modeling (MLM) which is like RoBERTa. One of the languages is selected for each training sample, - and the model input is a sentence of 256 tokens, that may span over several documents in one of those languages, - with dynamic masking of the tokens. -- A combination of MLM and translation language modeling (TLM). This consists of concatenating a sentence in two - different languages, with random masking. To predict one of the masked tokens, the model can use both, the - surrounding context in language 1 and the context given by language 2. - -Checkpoints refer to which method was used for pretraining by having *clm*, *mlm* or *mlm-tlm* in their names. On top -of positional embeddings, the model has language embeddings. When training using MLM/CLM, this gives the model an -indication of the language used, and when training using MLM+TLM, an indication of the language used for each part. - -The library provides a version of the model for language modeling, token classification, sentence classification and -question answering. - -### XLM-RoBERTa - -
- -Models - - -Doc - - -Spaces - -
- - -[Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116), Alexis Conneau et -al. - -Uses RoBERTa tricks on the XLM approach, but does not use the translation language modeling objective. It only uses -masked language modeling on sentences coming from one language. However, the model is trained on many more languages -(100) and doesn't use the language embeddings, so it's capable of detecting the input language by itself. - -The library provides a version of the model for masked language modeling, token classification, sentence -classification, multiple choice classification and question answering. - -### FlauBERT - -
- -Models - - -Doc - - -Spaces - -
- - -[FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372), Hang Le et al. - -Like RoBERTa, without the sentence ordering prediction (so just trained on the MLM objective). - -The library provides a version of the model for language modeling and sentence classification. - -### ELECTRA - -
- -Models - - -Doc - - -Spaces - -
- - -[ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators](https://arxiv.org/abs/2003.10555), -Kevin Clark et al. - -ELECTRA is a transformer model pretrained with the use of another (small) masked language model. The inputs are -corrupted by that language model, which takes an input text that is randomly masked and outputs a text in which ELECTRA -has to predict which token is an original and which one has been replaced. Like for GAN training, the small language -model is trained for a few steps (but with the original texts as objective, not to fool the ELECTRA model like in a -traditional GAN setting) then the ELECTRA model is trained for a few steps. - -The library provides a version of the model for masked language modeling, token classification and sentence -classification. - -### Funnel Transformer - -
- -Models - - -Doc - - -Spaces - -
- - -[Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236), Zihang Dai et al. - -Funnel Transformer is a transformer model using pooling, a bit like a ResNet model: layers are grouped in blocks, and -at the beginning of each block (except the first one), the hidden states are pooled among the sequence dimension. This -way, their length is divided by 2, which speeds up the computation of the next hidden states. All pretrained models -have three blocks, which means the final hidden state has a sequence length that is one fourth of the original sequence -length. - -For tasks such as classification, this is not a problem, but for tasks like masked language modeling or token -classification, we need a hidden state with the same sequence length as the original input. In those cases, the final -hidden states are upsampled to the input sequence length and go through two additional layers. That's why there are two -versions of each checkpoint. The version suffixed with "-base" contains only the three blocks, while the version -without that suffix contains the three blocks and the upsampling head with its additional layers. - -The pretrained models available use the same pretraining objective as ELECTRA. - -The library provides a version of the model for masked language modeling, token classification, sentence -classification, multiple choice classification and question answering. - - - -### Longformer - -
- -Models - - -Doc - - -Spaces - -
- - -[Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150), Iz Beltagy et al. - -A transformer model replacing the attention matrices by sparse matrices to go faster. Often, the local context (e.g., -what are the two tokens left and right?) is enough to take action for a given token. Some preselected input tokens are -still given global attention, but the attention matrix has way less parameters, resulting in a speed-up. See the -[local attention section](#local-attention) for more information. - -It is pretrained the same way a RoBERTa otherwise. - - - -This model could be very well be used in an autoregressive setting, there is no checkpoint for such a -pretraining yet, though. - - - -The library provides a version of the model for masked language modeling, token classification, sentence -classification, multiple choice classification and question answering. - - - -## Sequence-to-sequence models - -As mentioned before, these models keep both the encoder and the decoder of the original transformer. - - - -### BART - -
- -Models - - -Doc - - -Spaces - -
- - -[BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461), Mike Lewis et al. - -Sequence-to-sequence model with an encoder and a decoder. Encoder is fed a corrupted version of the tokens, decoder is -fed the original tokens (but has a mask to hide the future words like a regular transformers decoder). A composition of -the following transformations are applied on the pretraining tasks for the encoder: - -- mask random tokens (like in BERT) -- delete random tokens -- mask a span of k tokens with a single mask token (a span of 0 tokens is an insertion of a mask token) -- permute sentences -- rotate the document to make it start at a specific token - -The library provides a version of this model for conditional generation and sequence classification. - -### Pegasus - -
- -Models - - -Doc - - -Spaces - -
- - -[PEGASUS: Pre-training with Extracted Gap-sentences forAbstractive Summarization](https://arxiv.org/pdf/1912.08777.pdf), Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu on Dec 18, 2019. - -Sequence-to-sequence model with the same encoder-decoder model architecture as BART. Pegasus is pre-trained jointly on -two self-supervised objective functions: Masked Language Modeling (MLM) and a novel summarization specific pretraining -objective, called Gap Sentence Generation (GSG). - -- MLM: encoder input tokens are randomly replaced by a mask tokens and have to be predicted by the encoder (like in - BERT) -- GSG: whole encoder input sentences are replaced by a second mask token and fed to the decoder, but which has a - causal mask to hide the future words like a regular auto-regressive transformer decoder. - -In contrast to BART, Pegasus' pretraining task is intentionally similar to summarization: important sentences are -masked and are generated together as one output sequence from the remaining sentences, similar to an extractive -summary. - -The library provides a version of this model for conditional generation, which should be used for summarization. - - -### MarianMT - -
- -Models - - -Doc - - -Spaces - -
- - -[Marian: Fast Neural Machine Translation in C++](https://arxiv.org/abs/1804.00344), Marcin Junczys-Dowmunt et al. - -A framework for translation models, using the same models as BART - -The library provides a version of this model for conditional generation. - - -### T5 - -
- -Models - - -Doc - - -Spaces - -
- - -[Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683), Colin Raffel et al. - -Uses the traditional transformer model (with a slight change in the positional embeddings, which are learned at each -layer). To be able to operate on all NLP tasks, it transforms them into text-to-text problems by using specific -prefixes: “summarize: ”, “question: ”, “translate English to German: ” and so forth. - -The pretraining includes both supervised and self-supervised training. Supervised training is conducted on downstream -tasks provided by the GLUE and SuperGLUE benchmarks (converting them into text-to-text tasks as explained above). - -Self-supervised training uses corrupted tokens, by randomly removing 15% of the tokens and replacing them with -individual sentinel tokens (if several consecutive tokens are marked for removal, the whole group is replaced with a -single sentinel token). The input of the encoder is the corrupted sentence, the input of the decoder is the original -sentence and the target is then the dropped out tokens delimited by their sentinel tokens. - -For instance, if we have the sentence “My dog is very cute .”, and we decide to remove the tokens: "dog", "is" and -"cute", the encoder input becomes “My very .” and the target input becomes “ dog is cute .” - -The library provides a version of this model for conditional generation. - - -### MT5 - -
- -Models - - -Doc - - -Spaces - -
- - -[mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934), Linting Xue -et al. - -The model architecture is same as T5. mT5's pretraining objective includes T5's self-supervised training, but not T5's -supervised training. mT5 is trained on 101 languages. - -The library provides a version of this model for conditional generation. - - -### MBart - -
- -Models - - -Doc - - -Spaces - -
- - -[Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, -Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. - -The model architecture and pretraining objective is same as BART, but MBart is trained on 25 languages and is intended -for supervised and unsupervised machine translation. MBart is one of the first methods for pretraining a complete -sequence-to-sequence model by denoising full texts in multiple languages, - -The library provides a version of this model for conditional generation. - -The [mbart-large-en-ro checkpoint](https://huggingface.co/facebook/mbart-large-en-ro) can be used for english -> -romanian translation. - -The [mbart-large-cc25](https://huggingface.co/facebook/mbart-large-cc25) checkpoint can be finetuned for other -translation and summarization tasks, using code in ```examples/pytorch/translation/``` , but is not very useful without -finetuning. - - -### ProphetNet - -
- -Models - - -Doc - - -Spaces - -
- - -[ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training,](https://arxiv.org/abs/2001.04063) by -Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang, Ming Zhou. - -ProphetNet introduces a novel *sequence-to-sequence* pretraining objective, called *future n-gram prediction*. In -future n-gram prediction, the model predicts the next n tokens simultaneously based on previous context tokens at each -time step instead instead of just the single next token. The future n-gram prediction explicitly encourages the model -to plan for the future tokens and prevent overfitting on strong local correlations. The model architecture is based on -the original Transformer, but replaces the "standard" self-attention mechanism in the decoder by a a main -self-attention mechanism and a self and n-stream (predict) self-attention mechanism. - -The library provides a pre-trained version of this model for conditional generation and a fine-tuned version for -summarization. - -### XLM-ProphetNet - -
- -Models - - -Doc - - -Spaces - -
- - -[ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training,](https://arxiv.org/abs/2001.04063) by -Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang, Ming Zhou. - -XLM-ProphetNet's model architecture and pretraining objective is same as ProphetNet, but XLM-ProphetNet was pre-trained -on the cross-lingual dataset [XGLUE](https://arxiv.org/abs/2004.01401). - -The library provides a pre-trained version of this model for multi-lingual conditional generation and fine-tuned -versions for headline generation and question generation, respectively. - - - -## Multimodal models - -There is one multimodal model in the library which has not been pretrained in the self-supervised fashion like the -others. - -### MMBT - -[Supervised Multimodal Bitransformers for Classifying Images and Text](https://arxiv.org/abs/1909.02950), Douwe Kiela -et al. - -A transformers model used in multimodal settings, combining a text and an image to make predictions. The transformer -model takes as inputs the embeddings of the tokenized text and the final activations of a pretrained on images resnet -(after the pooling layer) that goes through a linear layer (to go from number of features at the end of the resnet to -the hidden state dimension of the transformer). - -The different inputs are concatenated, and on top of the positional embeddings, a segment embedding is added to let the -model know which part of the input vector corresponds to the text and which to the image. - -The pretrained model only works for classification. - - - - - -## Retrieval-based models - -Some models use documents retrieval during (pre)training and inference for open-domain question answering, for example. - - -### DPR - -
- -Models - - -Doc - - -Spaces - -
- - -[Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906), Vladimir Karpukhin et -al. - -Dense Passage Retrieval (DPR) - is a set of tools and models for state-of-the-art open-domain question-answering -research. - - -DPR consists in three models: - -- Question encoder: encode questions as vectors -- Context encoder: encode contexts as vectors -- Reader: extract the answer of the questions inside retrieved contexts, along with a relevance score (high if the - inferred span actually answers the question). - -DPR's pipeline (not implemented yet) uses a retrieval step to find the top k contexts given a certain question, and -then it calls the reader with the question and the retrieved documents to get the answer. - -### RAG - -
- -Models - - -Doc - -
- -[Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401), Patrick Lewis, -Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau -Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela - -Retrieval-augmented generation ("RAG") models combine the powers of pretrained dense retrieval (DPR) and Seq2Seq -models. RAG models retrieve docs, pass them to a seq2seq model, then marginalize to generate outputs. The retriever and -seq2seq modules are initialized from pretrained models, and fine-tuned jointly, allowing both retrieval and generation -to adapt to downstream tasks. - -The two models RAG-Token and RAG-Sequence are available for generation. - -## More technical aspects - -### Full vs sparse attention - -Most transformer models use full attention in the sense that the attention matrix is square. It can be a big -computational bottleneck when you have long texts. Longformer and reformer are models that try to be more efficient and -use a sparse version of the attention matrix to speed up training. - - - -**LSH attention** - -[Reformer](#reformer) uses LSH attention. In the softmax(QK^t), only the biggest elements (in the softmax -dimension) of the matrix QK^t are going to give useful contributions. So for each query q in Q, we can consider only -the keys k in K that are close to q. A hash function is used to determine if q and k are close. The attention mask is -modified to mask the current token (except at the first position), because it will give a query and a key equal (so -very similar to each other). Since the hash can be a bit random, several hash functions are used in practice -(determined by a n_rounds parameter) and then are averaged together. - - - -**Local attention** - -[Longformer](#longformer) uses local attention: often, the local context (e.g., what are the two tokens to the -left and right?) is enough to take action for a given token. Also, by stacking attention layers that have a small -window, the last layer will have a receptive field of more than just the tokens in the window, allowing them to build a -representation of the whole sentence. - -Some preselected input tokens are also given global attention: for those few tokens, the attention matrix can access -all tokens and this process is symmetric: all other tokens have access to those specific tokens (on top of the ones in -their local window). This is shown in Figure 2d of the paper, see below for a sample attention mask: - - - -Using those attention matrices with less parameters then allows the model to have inputs having a bigger sequence -length. - -### Other tricks - - - -**Axial positional encodings** - -[Reformer](#reformer) uses axial positional encodings: in traditional transformer models, the positional encoding -E is a matrix of size \\(l\\) by \\(d\\), \\(l\\) being the sequence length and \\(d\\) the dimension of the -hidden state. If you have very long texts, this matrix can be huge and take way too much space on the GPU. To alleviate -that, axial positional encodings consist of factorizing that big matrix E in two smaller matrices E1 and E2, with -dimensions \\(l_{1} \times d_{1}\\) and \\(l_{2} \times d_{2}\\), such that \\(l_{1} \times l_{2} = l\\) and -\\(d_{1} + d_{2} = d\\) (with the product for the lengths, this ends up being way smaller). The embedding for time -step \\(j\\) in E is obtained by concatenating the embeddings for timestep \\(j \% l1\\) in E1 and \\(j // l1\\) -in E2. 
diff --git a/docs/source/en/multilingual.md b/docs/source/en/multilingual.md new file mode 100644 index 000000000000..9bf904a3b373 --- /dev/null +++ b/docs/source/en/multilingual.md @@ -0,0 +1,179 @@ + + +# Multilingual models for inference + +[[open-in-colab]] + +There are several multilingual models in 🤗 Transformers, and their inference usage differs from monolingual models. Not *all* multilingual model usage is different though. Some models, like [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased), can be used just like a monolingual model. This guide will show you how to use multilingual models whose usage differs for inference. + +## XLM + +XLM has ten different checkpoints, only one of which is monolingual. The nine remaining model checkpoints can be split into two categories: the checkpoints that use language embeddings and those that don't. + +### XLM with language embeddings + +The following XLM models use language embeddings to specify the language used at inference: + +- `xlm-mlm-ende-1024` (Masked language modeling, English-German) +- `xlm-mlm-enfr-1024` (Masked language modeling, English-French) +- `xlm-mlm-enro-1024` (Masked language modeling, English-Romanian) +- `xlm-mlm-xnli15-1024` (Masked language modeling, XNLI languages) +- `xlm-mlm-tlm-xnli15-1024` (Masked language modeling + translation, XNLI languages) +- `xlm-clm-enfr-1024` (Causal language modeling, English-French) +- `xlm-clm-ende-1024` (Causal language modeling, English-German) + +Language embeddings are represented as a tensor of the same shape as the `input_ids` passed to the model. The values in these tensors depend on the language used and are identified by the tokenizer's `lang2id` and `id2lang` attributes. + +In this example, load the `xlm-clm-enfr-1024` checkpoint (Causal language modeling, English-French): + +```py +>>> import torch +>>> from transformers import XLMTokenizer, XLMWithLMHeadModel + +>>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024") +>>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024") +``` + +The `lang2id` attribute of the tokenizer displays this model's languages and their ids: + +```py +>>> print(tokenizer.lang2id) +{'en': 0, 'fr': 1} +``` + +Next, create an example input: + +```py +>>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1 +``` + +Set the language id as `"en"` and use it to define the language embedding. The language embedding is a tensor filled with `0` since that is the language id for English. This tensor should be the same size as `input_ids`. + +```py +>>> language_id = tokenizer.lang2id["en"] # 0 +>>> langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, ..., 0]) + +>>> # We reshape it to be of size (batch_size, sequence_length) +>>> langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1) +``` + +Now you can pass the `input_ids` and language embedding to the model: + +```py +>>> outputs = model(input_ids, langs=langs) +``` + +The [run_generation.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-generation/run_generation.py) script can generate text with language embeddings using the `xlm-clm` checkpoints. 
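+
+As a hedged follow-up to the model call above (this step is not part of the original guide and assumes a recent version of 🤗 Transformers in which the model returns an output object with `logits`), the language-modeling logits can be used to greedily pick the most likely next token:
+
+```py
+>>> # A minimal sketch: take the logits at the last position and decode the argmax token.
+>>> next_token_logits = outputs.logits[0, -1, :]
+>>> next_token_id = int(torch.argmax(next_token_logits))
+>>> next_token = tokenizer.decode([next_token_id])
+```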
+ +### XLM without language embeddings + +The following XLM models do not require language embeddings during inference: + +- `xlm-mlm-17-1280` (Masked language modeling, 17 languages) +- `xlm-mlm-100-1280` (Masked language modeling, 100 languages) + +These models are used for generic sentence representations, unlike the previous XLM checkpoints. + +## BERT + +The following BERT models can be used for multilingual tasks: + +- `bert-base-multilingual-uncased` (Masked language modeling + Next sentence prediction, 102 languages) +- `bert-base-multilingual-cased` (Masked language modeling + Next sentence prediction, 104 languages) + +These models do not require language embeddings during inference. They should identify the language from the +context and infer accordingly. + +## XLM-RoBERTa + +The following XLM-RoBERTa models can be used for multilingual tasks: + +- `xlm-roberta-base` (Masked language modeling, 100 languages) +- `xlm-roberta-large` (Masked language modeling, 100 languages) + +XLM-RoBERTa was trained on 2.5TB of newly created and cleaned CommonCrawl data in 100 languages. It provides strong gains over previously released multilingual models like mBERT or XLM on downstream tasks like classification, sequence labeling, and question answering. + +## M2M100 + +The following M2M100 models can be used for multilingual translation: + +- `facebook/m2m100_418M` (Translation) +- `facebook/m2m100_1.2B` (Translation) + +In this example, load the `facebook/m2m100_418M` checkpoint to translate from Chinese to English. You can set the source language in the tokenizer: + +```py +>>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer + +>>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." +>>> chinese_text = "不要插手巫師的事務, 因為他們是微妙的, 很快就會發怒." + +>>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="zh") +>>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") +``` + +Tokenize the text: + +```py +>>> encoded_zh = tokenizer(chinese_text, return_tensors="pt") +``` + +M2M100 forces the target language id as the first generated token to translate to the target language. Set the `forced_bos_token_id` to `en` in the `generate` method to translate to English: + +```py +>>> generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en")) +>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +'Do not interfere with the matters of the witches, because they are delicate and will soon be angry.' +``` + +## MBart + +The following MBart models can be used for multilingual translation: + +- `facebook/mbart-large-50-one-to-many-mmt` (One-to-many multilingual machine translation, 50 languages) +- `facebook/mbart-large-50-many-to-many-mmt` (Many-to-many multilingual machine translation, 50 languages) +- `facebook/mbart-large-50-many-to-one-mmt` (Many-to-one multilingual machine translation, 50 languages) +- `facebook/mbart-large-50` (Multilingual translation, 50 languages) +- `facebook/mbart-large-cc25` + +In this example, load the `facebook/mbart-large-50-many-to-many-mmt` checkpoint to translate Finnish to English. You can set the source language in the tokenizer: + +```py +>>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM + +>>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." +>>> fi_text = "Älä sekaannu velhojen asioihin, sillä ne ovat hienovaraisia ja nopeasti vihaisia." 
+ +>>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", src_lang="fi_FI") +>>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") +``` + +Tokenize the text: + +```py +>>> encoded_en = tokenizer(en_text, return_tensors="pt") +``` + +MBart forces the target language id as the first generated token to translate to the target language. Set the `forced_bos_token_id` to `en` in the `generate` method to translate to English: + +```py +>>> generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]) +>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +"Don't interfere with the wizard's affairs, because they are subtle, will soon get angry." +``` + +If you are using the `facebook/mbart-large-50-many-to-one-mmt` checkpoint, you don't need to force the target language id as the first generated token otherwise the usage is the same. diff --git a/docs/source/en/multilingual.mdx b/docs/source/en/multilingual.mdx deleted file mode 100644 index 7c95de6ffc09..000000000000 --- a/docs/source/en/multilingual.mdx +++ /dev/null @@ -1,175 +0,0 @@ - - -# Multilingual models for inference - -[[open-in-colab]] - -There are several multilingual models in 🤗 Transformers, and their inference usage differs from monolingual models. Not *all* multilingual model usage is different though. Some models, like [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased), can be used just like a monolingual model. This guide will show you how to use multilingual models whose usage differs for inference. - -## XLM - -XLM has ten different checkpoints, only one of which is monolingual. The nine remaining model checkpoints can be split into two categories: the checkpoints that use language embeddings and those that don't. - -### XLM with language embeddings - -The following XLM models use language embeddings to specify the language used at inference: - -- `xlm-mlm-ende-1024` (Masked language modeling, English-German) -- `xlm-mlm-enfr-1024` (Masked language modeling, English-French) -- `xlm-mlm-enro-1024` (Masked language modeling, English-Romanian) -- `xlm-mlm-xnli15-1024` (Masked language modeling, XNLI languages) -- `xlm-mlm-tlm-xnli15-1024` (Masked language modeling + translation, XNLI languages) -- `xlm-clm-enfr-1024` (Causal language modeling, English-French) -- `xlm-clm-ende-1024` (Causal language modeling, English-German) - -Language embeddings are represented as a tensor of the same shape as the `input_ids` passed to the model. The values in these tensors depend on the language used and are identified by the tokenizer's `lang2id` and `id2lang` attributes. - -In this example, load the `xlm-clm-enfr-1024` checkpoint (Causal language modeling, English-French): - -```py ->>> import torch ->>> from transformers import XLMTokenizer, XLMWithLMHeadModel - ->>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024") ->>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024") -``` - -The `lang2id` attribute of the tokenizer displays this model's languages and their ids: - -```py ->>> print(tokenizer.lang2id) -{'en': 0, 'fr': 1} -``` - -Next, create an example input: - -```py ->>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1 -``` - -Set the language id as `"en"` and use it to define the language embedding. The language embedding is a tensor filled with `0` since that is the language id for English. 
This tensor should be the same size as `input_ids`. - -```py ->>> language_id = tokenizer.lang2id["en"] # 0 ->>> langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, ..., 0]) - ->>> # We reshape it to be of size (batch_size, sequence_length) ->>> langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1) -``` - -Now you can pass the `input_ids` and language embedding to the model: - -```py ->>> outputs = model(input_ids, langs=langs) -``` - -The [run_generation.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-generation/run_generation.py) script can generate text with language embeddings using the `xlm-clm` checkpoints. - -### XLM without language embeddings - -The following XLM models do not require language embeddings during inference: - -- `xlm-mlm-17-1280` (Masked language modeling, 17 languages) -- `xlm-mlm-100-1280` (Masked language modeling, 100 languages) - -These models are used for generic sentence representations, unlike the previous XLM checkpoints. - -## BERT - -The following BERT models can be used for multilingual tasks: - -- `bert-base-multilingual-uncased` (Masked language modeling + Next sentence prediction, 102 languages) -- `bert-base-multilingual-cased` (Masked language modeling + Next sentence prediction, 104 languages) - -These models do not require language embeddings during inference. They should identify the language from the -context and infer accordingly. - -## XLM-RoBERTa - -The following XLM-RoBERTa models can be used for multilingual tasks: - -- `xlm-roberta-base` (Masked language modeling, 100 languages) -- `xlm-roberta-large` (Masked language modeling, 100 languages) - -XLM-RoBERTa was trained on 2.5TB of newly created and cleaned CommonCrawl data in 100 languages. It provides strong gains over previously released multilingual models like mBERT or XLM on downstream tasks like classification, sequence labeling, and question answering. - -## M2M100 - -The following M2M100 models can be used for multilingual translation: - -- `facebook/m2m100_418M` (Translation) -- `facebook/m2m100_1.2B` (Translation) - -In this example, load the `facebook/m2m100_418M` checkpoint to translate from Chinese to English. You can set the source language in the tokenizer: - -```py ->>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer - ->>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." ->>> chinese_text = "不要插手巫師的事務, 因為他們是微妙的, 很快就會發怒." - ->>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="zh") ->>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") -``` - -Tokenize the text: - -```py ->>> encoded_zh = tokenizer(chinese_text, return_tensors="pt") -``` - -M2M100 forces the target language id as the first generated token to translate to the target language. Set the `forced_bos_token_id` to `en` in the `generate` method to translate to English: - -```py ->>> generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en")) ->>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) -'Do not interfere with the matters of the witches, because they are delicate and will soon be angry.' 
-``` - -## MBart - -The following MBart models can be used for multilingual translation: - -- `facebook/mbart-large-50-one-to-many-mmt` (One-to-many multilingual machine translation, 50 languages) -- `facebook/mbart-large-50-many-to-many-mmt` (Many-to-many multilingual machine translation, 50 languages) -- `facebook/mbart-large-50-many-to-one-mmt` (Many-to-one multilingual machine translation, 50 languages) -- `facebook/mbart-large-50` (Multilingual translation, 50 languages) -- `facebook/mbart-large-cc25` - -In this example, load the `facebook/mbart-large-50-many-to-many-mmt` checkpoint to translate Finnish to English. You can set the source language in the tokenizer: - -```py ->>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - ->>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." ->>> fi_text = "Älä sekaannu velhojen asioihin, sillä ne ovat hienovaraisia ja nopeasti vihaisia." - ->>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", src_lang="fi_FI") ->>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") -``` - -Tokenize the text: - -```py ->>> encoded_en = tokenizer(en_text, return_tensors="pt") -``` - -MBart forces the target language id as the first generated token to translate to the target language. Set the `forced_bos_token_id` to `en` in the `generate` method to translate to English: - -```py ->>> generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.lang_code_to_id("en_XX")) ->>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) -"Don't interfere with the wizard's affairs, because they are subtle, will soon get angry." -``` - -If you are using the `facebook/mbart-large-50-many-to-one-mmt` checkpoint, you don't need to force the target language id as the first generated token otherwise the usage is the same. \ No newline at end of file diff --git a/docs/source/en/pad_truncation.md b/docs/source/en/pad_truncation.md new file mode 100644 index 000000000000..8094dc1bc2aa --- /dev/null +++ b/docs/source/en/pad_truncation.md @@ -0,0 +1,71 @@ + + +# Padding and truncation + +Batched inputs are often different lengths, so they can't be converted to fixed-size tensors. Padding and truncation are strategies for dealing with this problem, to create rectangular tensors from batches of varying lengths. Padding adds a special **padding token** to ensure shorter sequences will have the same length as either the longest sequence in a batch or the maximum length accepted by the model. Truncation works in the other direction by truncating long sequences. + +In most cases, padding your batch to the length of the longest sequence and truncating to the maximum length a model can accept works pretty well. However, the API supports more strategies if you need them. The three arguments you need to are: `padding`, `truncation` and `max_length`. + +The `padding` argument controls padding. It can be a boolean or a string: + + - `True` or `'longest'`: pad to the longest sequence in the batch (no padding is applied if you only provide + a single sequence). + - `'max_length'`: pad to a length specified by the `max_length` argument or the maximum length accepted + by the model if no `max_length` is provided (`max_length=None`). Padding will still be applied if you only provide a single sequence. + - `False` or `'do_not_pad'`: no padding is applied. This is the default behavior. + +The `truncation` argument controls truncation. 
It can be a boolean or a string: + + - `True` or `'longest_first'`: truncate to a maximum length specified by the `max_length` argument or + the maximum length accepted by the model if no `max_length` is provided (`max_length=None`). This will + truncate token by token, removing a token from the longest sequence in the pair until the proper length is + reached. + - `'only_second'`: truncate to a maximum length specified by the `max_length` argument or the maximum + length accepted by the model if no `max_length` is provided (`max_length=None`). This will only truncate + the second sentence of a pair if a pair of sequences (or a batch of pairs of sequences) is provided. + - `'only_first'`: truncate to a maximum length specified by the `max_length` argument or the maximum + length accepted by the model if no `max_length` is provided (`max_length=None`). This will only truncate + the first sentence of a pair if a pair of sequences (or a batch of pairs of sequences) is provided. + - `False` or `'do_not_truncate'`: no truncation is applied. This is the default behavior. + +The `max_length` argument controls the length of the padding and truncation. It can be an integer or `None`, in which case it will default to the maximum length the model can accept. If the model has no specific maximum input length, truncation or padding to `max_length` is deactivated. + +The following table summarizes the recommended way to setup padding and truncation. If you use pairs of input sequences in any of the following examples, you can replace `truncation=True` by a `STRATEGY` selected in +`['only_first', 'only_second', 'longest_first']`, i.e. `truncation='only_second'` or `truncation='longest_first'` to control how both sequences in the pair are truncated as detailed before. 
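As a quick illustration of how these three arguments combine in practice, here is a minimal sketch before the summary table below; the checkpoint name and sentences are placeholders rather than part of the guide:

```py
from transformers import AutoTokenizer

# Illustrative checkpoint and batch; any tokenizer behaves the same way.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
batch_sentences = [
    "A short sentence.",
    "A considerably longer sentence that will be padded or truncated as needed.",
]

# Pad to the longest sequence in the batch and truncate anything beyond 32 tokens.
encoded = tokenizer(batch_sentences, padding=True, truncation=True, max_length=32, return_tensors="pt")
print(encoded["input_ids"].shape)  # rectangular tensor of shape (batch_size, padded_length)
```

Swapping `truncation=True` for one of the strategies above only changes which sequence of a pair is truncated; the padding behavior stays the same.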
+ +| Truncation | Padding | Instruction | +|--------------------------------------|-----------------------------------|---------------------------------------------------------------------------------------------| +| no truncation | no padding | `tokenizer(batch_sentences)` | +| | padding to max sequence in batch | `tokenizer(batch_sentences, padding=True)` or | +| | | `tokenizer(batch_sentences, padding='longest')` | +| | padding to max model input length | `tokenizer(batch_sentences, padding='max_length')` | +| | padding to specific length | `tokenizer(batch_sentences, padding='max_length', max_length=42)` | +| | padding to a multiple of a value | `tokenizer(batch_sentences, padding=True, pad_to_multiple_of=8) | +| truncation to max model input length | no padding | `tokenizer(batch_sentences, truncation=True)` or | +| | | `tokenizer(batch_sentences, truncation=STRATEGY)` | +| | padding to max sequence in batch | `tokenizer(batch_sentences, padding=True, truncation=True)` or | +| | | `tokenizer(batch_sentences, padding=True, truncation=STRATEGY)` | +| | padding to max model input length | `tokenizer(batch_sentences, padding='max_length', truncation=True)` or | +| | | `tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY)` | +| | padding to specific length | Not possible | +| truncation to specific length | no padding | `tokenizer(batch_sentences, truncation=True, max_length=42)` or | +| | | `tokenizer(batch_sentences, truncation=STRATEGY, max_length=42)` | +| | padding to max sequence in batch | `tokenizer(batch_sentences, padding=True, truncation=True, max_length=42)` or | +| | | `tokenizer(batch_sentences, padding=True, truncation=STRATEGY, max_length=42)` | +| | padding to max model input length | Not possible | +| | padding to specific length | `tokenizer(batch_sentences, padding='max_length', truncation=True, max_length=42)` or | +| | | `tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY, max_length=42)` | diff --git a/docs/source/en/pad_truncation.mdx b/docs/source/en/pad_truncation.mdx deleted file mode 100644 index f848e23bed50..000000000000 --- a/docs/source/en/pad_truncation.mdx +++ /dev/null @@ -1,66 +0,0 @@ - - -# Padding and truncation - -Batched inputs are often different lengths, so they can't be converted to fixed-size tensors. Padding and truncation are strategies for dealing with this problem, to create rectangular tensors from batches of varying lengths. Padding adds a special **padding token** to ensure shorter sequences will have the same length as either the longest sequence in a batch or the maximum length accepted by the model. Truncation works in the other direction by truncating long sequences. - -In most cases, padding your batch to the length of the longest sequence and truncating to the maximum length a model can accept works pretty well. However, the API supports more strategies if you need them. The three arguments you need to are: `padding`, `truncation` and `max_length`. - -The `padding` argument controls padding. It can be a boolean or a string: - - - `True` or `'longest'`: pad to the longest sequence in the batch (no padding is applied if you only provide - a single sequence). - - `'max_length'`: pad to a length specified by the `max_length` argument or the maximum length accepted - by the model if no `max_length` is provided (`max_length=None`). Padding will still be applied if you only provide a single sequence. - - `False` or `'do_not_pad'`: no padding is applied. This is the default behavior. 
- -The `truncation` argument controls truncation. It can be a boolean or a string: - - - `True` or `'longest_first'`: truncate to a maximum length specified by the `max_length` argument or - the maximum length accepted by the model if no `max_length` is provided (`max_length=None`). This will - truncate token by token, removing a token from the longest sequence in the pair until the proper length is - reached. - - `'only_second'`: truncate to a maximum length specified by the `max_length` argument or the maximum - length accepted by the model if no `max_length` is provided (`max_length=None`). This will only truncate - the second sentence of a pair if a pair of sequences (or a batch of pairs of sequences) is provided. - - `'only_first'`: truncate to a maximum length specified by the `max_length` argument or the maximum - length accepted by the model if no `max_length` is provided (`max_length=None`). This will only truncate - the first sentence of a pair if a pair of sequences (or a batch of pairs of sequences) is provided. - - `False` or `'do_not_truncate'`: no truncation is applied. This is the default behavior. - -The `max_length` argument controls the length of the padding and truncation. It can be an integer or `None`, in which case it will default to the maximum length the model can accept. If the model has no specific maximum input length, truncation or padding to `max_length` is deactivated. - -The following table summarizes the recommended way to setup padding and truncation. If you use pairs of input sequences in any of the following examples, you can replace `truncation=True` by a `STRATEGY` selected in -`['only_first', 'only_second', 'longest_first']`, i.e. `truncation='only_second'` or `truncation='longest_first'` to control how both sequences in the pair are truncated as detailed before. 
- -| Truncation | Padding | Instruction | -|--------------------------------------|-----------------------------------|---------------------------------------------------------------------------------------------| -| no truncation | no padding | `tokenizer(batch_sentences)` | -| | padding to max sequence in batch | `tokenizer(batch_sentences, padding=True)` or | -| | | `tokenizer(batch_sentences, padding='longest')` | -| | padding to max model input length | `tokenizer(batch_sentences, padding='max_length')` | -| | padding to specific length | `tokenizer(batch_sentences, padding='max_length', max_length=42)` | -| truncation to max model input length | no padding | `tokenizer(batch_sentences, truncation=True)` or | -| | | `tokenizer(batch_sentences, truncation=STRATEGY)` | -| | padding to max sequence in batch | `tokenizer(batch_sentences, padding=True, truncation=True)` or | -| | | `tokenizer(batch_sentences, padding=True, truncation=STRATEGY)` | -| | padding to max model input length | `tokenizer(batch_sentences, padding='max_length', truncation=True)` or | -| | | `tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY)` | -| | padding to specific length | Not possible | -| truncation to specific length | no padding | `tokenizer(batch_sentences, truncation=True, max_length=42)` or | -| | | `tokenizer(batch_sentences, truncation=STRATEGY, max_length=42)` | -| | padding to max sequence in batch | `tokenizer(batch_sentences, padding=True, truncation=True, max_length=42)` or | -| | | `tokenizer(batch_sentences, padding=True, truncation=STRATEGY, max_length=42)` | -| | padding to max model input length | Not possible | -| | padding to specific length | `tokenizer(batch_sentences, padding='max_length', truncation=True, max_length=42)` or | -| | | `tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY, max_length=42)` | diff --git a/docs/source/en/peft.md b/docs/source/en/peft.md new file mode 100644 index 000000000000..302b614e5f7b --- /dev/null +++ b/docs/source/en/peft.md @@ -0,0 +1,216 @@ + + +# Load adapters with 🤗 PEFT + +[[open-in-colab]] + +[Parameter-Efficient Fine Tuning (PEFT)](https://huggingface.co/blog/peft) methods freeze the pretrained model parameters during fine-tuning and add a small number of trainable parameters (the adapters) on top of it. The adapters are trained to learn task-specific information. This approach has been shown to be very memory-efficient with lower compute usage while producing results comparable to a fully fine-tuned model. + +Adapters trained with PEFT are also usually an order of magnitude smaller than the full model, making it convenient to share, store, and load them. + +
+ +
The adapter weights for an OPTForCausalLM model stored on the Hub are only ~6MB compared to the full size of the model weights, which can be ~700MB.
+
+ +If you're interested in learning more about the 🤗 PEFT library, check out the [documentation](https://huggingface.co/docs/peft/index). + +## Setup + +Get started by installing 🤗 PEFT: + +```bash +pip install peft +``` + +If you want to try out the brand new features, you might be interested in installing the library from source: + +```bash +pip install git+https://github.com/huggingface/peft.git +``` + +## Supported PEFT models + +🤗 Transformers natively supports some PEFT methods, meaning you can load adapter weights stored locally or on the Hub and easily run or train them with a few lines of code. The following methods are supported: + +- [Low Rank Adapters](https://huggingface.co/docs/peft/conceptual_guides/lora) +- [IA3](https://huggingface.co/docs/peft/conceptual_guides/ia3) +- [AdaLoRA](https://arxiv.org/abs/2303.10512) + +If you want to use other PEFT methods, such as prompt learning or prompt tuning, or about the 🤗 PEFT library in general, please refer to the [documentation](https://huggingface.co/docs/peft/index). + + +## Load a PEFT adapter + +To load and use a PEFT adapter model from 🤗 Transformers, make sure the Hub repository or local directory contains an `adapter_config.json` file and the adapter weights, as shown in the example image above. Then you can load the PEFT adapter model using the `AutoModelFor` class. For example, to load a PEFT adapter model for causal language modeling: + +1. specify the PEFT model id +2. pass it to the [`AutoModelForCausalLM`] class + +```py +from transformers import AutoModelForCausalLM, AutoTokenizer + +peft_model_id = "ybelkada/opt-350m-lora" +model = AutoModelForCausalLM.from_pretrained(peft_model_id) +``` + + + +You can load a PEFT adapter with either an `AutoModelFor` class or the base model class like `OPTForCausalLM` or `LlamaForCausalLM`. + + + +You can also load a PEFT adapter by calling the `load_adapter` method: + +```py +from transformers import AutoModelForCausalLM, AutoTokenizer + +model_id = "facebook/opt-350m" +peft_model_id = "ybelkada/opt-350m-lora" + +model = AutoModelForCausalLM.from_pretrained(model_id) +model.load_adapter(peft_model_id) +``` + +## Load in 8bit or 4bit + +The `bitsandbytes` integration supports 8bit and 4bit precision data types, which are useful for loading large models because it saves memory (see the `bitsandbytes` integration [guide](./quantization#bitsandbytes-integration) to learn more). Add the `load_in_8bit` or `load_in_4bit` parameters to [`~PreTrainedModel.from_pretrained`] and set `device_map="auto"` to effectively distribute the model to your hardware: + +```py +from transformers import AutoModelForCausalLM, AutoTokenizer + +peft_model_id = "ybelkada/opt-350m-lora" +model = AutoModelForCausalLM.from_pretrained(peft_model_id, device_map="auto", load_in_8bit=True) +``` + +## Add a new adapter + +You can use [`~peft.PeftModel.add_adapter`] to add a new adapter to a model with an existing adapter as long as the new adapter is the same type as the current one. 
For example, if you have an existing LoRA adapter attached to a model: + +```py +from transformers import AutoModelForCausalLM, OPTForCausalLM, AutoTokenizer +from peft import LoraConfig + +model_id = "facebook/opt-350m" +model = AutoModelForCausalLM.from_pretrained(model_id) + +lora_config = LoraConfig( + target_modules=["q_proj", "k_proj"], + init_lora_weights=False +) + +model.add_adapter(lora_config, adapter_name="adapter_1") +``` + +To add a new adapter: + +```py +# attach new adapter with same config +model.add_adapter(lora_config, adapter_name="adapter_2") +``` + +Now you can use [`~peft.PeftModel.set_adapter`] to set which adapter to use: + +```py +# use adapter_1 +model.set_adapter("adapter_1") +output = model.generate(**inputs) +print(tokenizer.decode(output[0], skip_special_tokens=True)) + +# use adapter_2 +model.set_adapter("adapter_2") +output = model.generate(**inputs) +print(tokenizer.decode(output[0], skip_special_tokens=True)) +``` + +## Enable and disable adapters + +Once you've added an adapter to a model, you can enable or disable the adapter module. To enable the adapter module: + +```py +from transformers import AutoModelForCausalLM, OPTForCausalLM, AutoTokenizer +from peft import PeftConfig + +model_id = "facebook/opt-350m" +adapter_model_id = "ybelkada/opt-350m-lora" +tokenizer = AutoTokenizer.from_pretrained(model_id) +text = "Hello" +inputs = tokenizer(text, return_tensors="pt") + +model = AutoModelForCausalLM.from_pretrained(model_id) +peft_config = PeftConfig.from_pretrained(adapter_model_id) + +# to initialize with random weights +peft_config.init_lora_weights = False + +model.add_adapter(peft_config) +model.enable_adapters() +output = model.generate(**inputs) +``` + +To disable the adapter module: + +```py +model.disable_adapters() +output = model.generate(**inputs) +``` + +## Train a PEFT adapter + +PEFT adapters are supported by the [`Trainer`] class so that you can train an adapter for your specific use case. It only requires adding a few more lines of code. For example, to train a LoRA adapter: + + + +If you aren't familiar with fine-tuning a model with [`Trainer`], take a look at the [Fine-tune a pretrained model](training) tutorial. + + + +1. Define your adapter configuration with the task type and hyperparameters (see [`~peft.LoraConfig`] for more details about what the hyperparameters do). + +```py +from peft import LoraConfig + +peft_config = LoraConfig( + lora_alpha=16, + lora_dropout=0.1, + r=64, + bias="none", + task_type="CAUSAL_LM", +) +``` + +2. Add the adapter to the model. + +```py +model.add_adapter(peft_config) +``` + +3. Now you can pass the model to [`Trainer`]! + +```py +trainer = Trainer(model=model, ...) +trainer.train() +``` + +To save your trained adapter and load it back: + +```py +model.save_pretrained(save_dir) +model = AutoModelForCausalLM.from_pretrained(save_dir) +``` + + diff --git a/docs/source/en/perf_hardware.md new file mode 100644 index 000000000000..a28824346e4b --- /dev/null +++ b/docs/source/en/perf_hardware.md @@ -0,0 +1,155 @@ + + + +# Custom hardware for training + +The hardware you use to run model training and inference can have a big effect on performance. For a deep dive into GPUs make sure to check out Tim Dettmer's excellent [blog post](https://timdettmers.com/2020/09/07/which-gpu-for-deep-learning/). + +Let's have a look at some practical advice for GPU setups.
+ +## GPU +When you train bigger models you have essentially three options: + +- bigger GPUs +- more GPUs +- more CPU and NVMe (offloaded to by [DeepSpeed-Infinity](main_classes/deepspeed#nvme-support)) + +Let's start at the case where you have a single GPU. + +### Power and Cooling + +If you bought an expensive high end GPU make sure you give it the correct power and sufficient cooling. + +**Power**: + +Some high end consumer GPU cards have 2 and sometimes 3 PCI-E 8-Pin power sockets. Make sure you have as many independent 12V PCI-E 8-Pin cables plugged into the card as there are sockets. Do not use the 2 splits at one end of the same cable (also known as pigtail cable). That is if you have 2 sockets on the GPU, you want 2 PCI-E 8-Pin cables going from your PSU to the card and not one that has 2 PCI-E 8-Pin connectors at the end! You won't get the full performance out of your card otherwise. + +Each PCI-E 8-Pin power cable needs to be plugged into a 12V rail on the PSU side and can supply up to 150W of power. + +Some other cards may use a PCI-E 12-Pin connectors, and these can deliver up to 500-600W of power. + +Low end cards may use 6-Pin connectors, which supply up to 75W of power. + +Additionally you want the high-end PSU that has stable voltage. Some lower quality ones may not give the card the stable voltage it needs to function at its peak. + +And of course the PSU needs to have enough unused Watts to power the card. + +**Cooling**: + +When a GPU gets overheated it will start throttling down and will not deliver full performance and it can even shutdown if it gets too hot. + +It's hard to tell the exact best temperature to strive for when a GPU is heavily loaded, but probably anything under +80C is good, but lower is better - perhaps 70-75C is an excellent range to be in. The throttling down is likely to start at around 84-90C. But other than throttling performance a prolonged very high temperature is likely to reduce the lifespan of a GPU. + +Next let's have a look at one of the most important aspects when having multiple GPUs: connectivity. + +### Multi-GPU Connectivity + +If you use multiple GPUs the way cards are inter-connected can have a huge impact on the total training time. If the GPUs are on the same physical node, you can run: + +``` +nvidia-smi topo -m +``` + +and it will tell you how the GPUs are inter-connected. On a machine with dual-GPU and which are connected with NVLink, you will most likely see something like: + +``` + GPU0 GPU1 CPU Affinity NUMA Affinity +GPU0 X NV2 0-23 N/A +GPU1 NV2 X 0-23 N/A +``` + +on a different machine w/o NVLink we may see: +``` + GPU0 GPU1 CPU Affinity NUMA Affinity +GPU0 X PHB 0-11 N/A +GPU1 PHB X 0-11 N/A +``` + +The report includes this legend: + +``` + X = Self + SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) + NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node + PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) + PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) + PIX = Connection traversing at most a single PCIe bridge + NV# = Connection traversing a bonded set of # NVLinks +``` + +So the first report `NV2` tells us the GPUs are interconnected with 2 NVLinks, and the second report `PHB` we have a typical consumer-level PCIe+Bridge setup. + +Check what type of connectivity you have on your setup. 
Some of these will make the communication between cards faster (e.g. NVLink), others slower (e.g. PHB). + +Depending on the type of scalability solution used, the connectivity speed could have a major or a minor impact. If the GPUs need to sync rarely, as in DDP, the impact of a slower connection will be less significant. If the GPUs need to send messages to each other often, as in ZeRO-DP, then faster connectivity becomes super important to achieve faster training. + +#### NVlink + +[NVLink](https://en.wikipedia.org/wiki/NVLink) is a wire-based serial multi-lane near-range communications link developed by Nvidia. + +Each new generation provides a faster bandwidth, e.g. here is a quote from [Nvidia Ampere GA102 GPU Architecture](https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/ampere/pdf/NVIDIA-ampere-GA102-GPU-Architecture-Whitepaper-V1.pdf): + +> Third-Generation NVLink® +> GA102 GPUs utilize NVIDIA’s third-generation NVLink interface, which includes four x4 links, +> with each link providing 14.0625 GB/sec bandwidth in each direction between two GPUs. Four +> links provide 56.25 GB/sec bandwidth in each direction, and 112.5 GB/sec total bandwidth +> between two GPUs. Two RTX 3090 GPUs can be connected together for SLI using NVLink. +> (Note that 3-Way and 4-Way SLI configurations are not supported.) + +So the higher `X` you get in the report of `NVX` in the output of `nvidia-smi topo -m` the better. The generation will depend on your GPU architecture. + +Let's compare the execution of a gpt2 language model training over a small sample of wikitext. + +The results are: + + +| NVlink | Time | +| ----- | ---: | +| Y | 101s | +| N | 131s | + + +You can see that NVLink completes the training ~23% faster. In the second benchmark we use `NCCL_P2P_DISABLE=1` to tell the GPUs not to use NVLink. + +Here is the full benchmark code and outputs: + +```bash +# DDP w/ NVLink + +rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch \ +--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \ +--dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \ +--output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 + +{'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69} + +# DDP w/o NVLink + +rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 python -m torch.distributed.launch \ +--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \ +--dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train +--output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 + +{'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69} +``` + +Hardware: 2x TITAN RTX 24GB each + NVlink with 2 NVLinks (`NV2` in `nvidia-smi topo -m`) +Software: `pytorch-1.8-to-be` + `cuda-11.0` / `transformers==4.3.0.dev0` diff --git a/docs/source/en/perf_hardware.mdx b/docs/source/en/perf_hardware.mdx deleted file mode 100644 index b28df49892b1..000000000000 --- a/docs/source/en/perf_hardware.mdx +++ /dev/null @@ -1,150 +0,0 @@ - - - -# Custom hardware for training - -The hardware you use to run model training and inference can have a big effect on performance. For a deep dive into GPUs make sure to check out Tim Dettmer's excellent [blog post](https://timdettmers.com/2020/09/07/which-gpu-for-deep-learning/). - -Let's have a look at some practical advice for GPU setups. 
- -## GPU -When you train bigger models you have essentially three options: -- bigger GPUs -- more GPUs -- more CPU and NVMe (offloaded to by [DeepSpeed-Infinity](main_classes/deepspeed#nvme-support)) - -Let's start at the case where you have a single GPU. - -### Power and Cooling - -If you bought an expensive high end GPU make sure you give it the correct power and sufficient cooling. - -**Power**: - -Some high end consumer GPU cards have 2 and sometimes 3 PCI-E 8-Pin power sockets. Make sure you have as many independent 12V PCI-E 8-Pin cables plugged into the card as there are sockets. Do not use the 2 splits at one end of the same cable (also known as pigtail cable). That is if you have 2 sockets on the GPU, you want 2 PCI-E 8-Pin cables going from your PSU to the card and not one that has 2 PCI-E 8-Pin connectors at the end! You won't get the full performance out of your card otherwise. - -Each PCI-E 8-Pin power cable needs to be plugged into a 12V rail on the PSU side and can supply up to 150W of power. - -Some other cards may use a PCI-E 12-Pin connectors, and these can deliver up to 500-600W of power. - -Low end cards may use 6-Pin connectors, which supply up to 75W of power. - -Additionally you want the high-end PSU that has stable voltage. Some lower quality ones may not give the card the stable voltage it needs to function at its peak. - -And of course the PSU needs to have enough unused Watts to power the card. - -**Cooling**: - -When a GPU gets overheated it will start throttling down and will not deliver full performance and it can even shutdown if it gets too hot. - -It's hard to tell the exact best temperature to strive for when a GPU is heavily loaded, but probably anything under +80C is good, but lower is better - perhaps 70-75C is an excellent range to be in. The throttling down is likely to start at around 84-90C. But other than throttling performance a prolonged very high temperature is likely to reduce the lifespan of a GPU. - -Next let's have a look at one of the most important aspects when having multiple GPUs: connectivity. - -### Multi-GPU Connectivity - -If you use multiple GPUs the way cards are inter-connected can have a huge impact on the total training time. If the GPUs are on the same physical node, you can run: - -``` -nvidia-smi topo -m -``` - -and it will tell you how the GPUs are inter-connected. On a machine with dual-GPU and which are connected with NVLink, you will most likely see something like: - -``` - GPU0 GPU1 CPU Affinity NUMA Affinity -GPU0 X NV2 0-23 N/A -GPU1 NV2 X 0-23 N/A -``` - -on a different machine w/o NVLink we may see: -``` - GPU0 GPU1 CPU Affinity NUMA Affinity -GPU0 X PHB 0-11 N/A -GPU1 PHB X 0-11 N/A -``` - -The report includes this legend: - -``` - X = Self - SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) - NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node - PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) - PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) - PIX = Connection traversing at most a single PCIe bridge - NV# = Connection traversing a bonded set of # NVLinks -``` - -So the first report `NV2` tells us the GPUs are interconnected with 2 NVLinks, and the second report `PHB` we have a typical consumer-level PCIe+Bridge setup. - -Check what type of connectivity you have on your setup. 
Some of these will make the communication between cards faster (e.g. NVLink), others slower (e.g. PHB). - -Depending on the type of scalability solution used, the connectivity speed could have a major or a minor impact. If the GPUs need to sync rarely, as in DDP, the impact of a slower connection will be less significant. If the GPUs need to send messages to each other often, as in ZeRO-DP, then faster connectivity becomes super important to achieve faster training. - -#### NVlink - -[NVLink](https://en.wikipedia.org/wiki/NVLink) is a wire-based serial multi-lane near-range communications link developed by Nvidia. - -Each new generation provides a faster bandwidth, e.g. here is a quote from [Nvidia Ampere GA102 GPU Architecture](https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/ampere/pdf/NVIDIA-ampere-GA102-GPU-Architecture-Whitepaper-V1.pdf): - -> Third-Generation NVLink® -> GA102 GPUs utilize NVIDIA’s third-generation NVLink interface, which includes four x4 links, -> with each link providing 14.0625 GB/sec bandwidth in each direction between two GPUs. Four -> links provide 56.25 GB/sec bandwidth in each direction, and 112.5 GB/sec total bandwidth -> between two GPUs. Two RTX 3090 GPUs can be connected together for SLI using NVLink. -> (Note that 3-Way and 4-Way SLI configurations are not supported.) - -So the higher `X` you get in the report of `NVX` in the output of `nvidia-smi topo -m` the better. The generation will depend on your GPU architecture. - -Let's compare the execution of a gpt2 language model training over a small sample of wikitext. - -The results are: - - -| NVlink | Time | -| ----- | ---: | -| Y | 101s | -| N | 131s | - - -You can see that NVLink completes the training ~23% faster. In the second benchmark we use `NCCL_P2P_DISABLE=1` to tell the GPUs not to use NVLink. - -Here is the full benchmark code and outputs: - -```bash -# DDP w/ NVLink - -rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch \ ---nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \ ---dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \ ---output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 - -{'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69} - -# DDP w/o NVLink - -rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 python -m torch.distributed.launch \ ---nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \ ---dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train ---output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 - -{'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69} -``` - -Hardware: 2x TITAN RTX 24GB each + NVlink with 2 NVLinks (`NV2` in `nvidia-smi topo -m`) -Software: `pytorch-1.8-to-be` + `cuda-11.0` / `transformers==4.3.0.dev0` diff --git a/docs/source/en/perf_infer_cpu.md b/docs/source/en/perf_infer_cpu.md new file mode 100644 index 000000000000..a7a524ae1ef0 --- /dev/null +++ b/docs/source/en/perf_infer_cpu.md @@ -0,0 +1,75 @@ + + +# Efficient Inference on CPU + +This guide focuses on inferencing large models efficiently on CPU. + +## `BetterTransformer` for faster inference + +We have recently integrated `BetterTransformer` for faster inference on CPU for text, image and audio models. Check the documentation about this integration [here](https://huggingface.co/docs/optimum/bettertransformer/overview) for more details. 
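As a minimal sketch of that integration (assuming the `optimum` package is installed; the sentiment-analysis checkpoint below is only illustrative), converting a model for faster CPU inference looks roughly like this:

```py
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Illustrative checkpoint; the conversion below requires `pip install optimum`.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Swap supported modules for their BetterTransformer (fastpath) equivalents.
model = model.to_bettertransformer()

inputs = tokenizer("BetterTransformer speeds up inference on CPU.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(logits)
```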
+ +## PyTorch JIT-mode (TorchScript) +TorchScript is a way to create serializable and optimizable models from PyTorch code. Any TorchScript program can be saved from a Python process and loaded in a process where there is no Python dependency. +Comparing to default eager mode, jit mode in PyTorch normally yields better performance for model inference from optimization methodologies like operator fusion. + +For a gentle introduction to TorchScript, see the Introduction to [PyTorch TorchScript tutorial](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html#tracing-modules). + +### IPEX Graph Optimization with JIT-mode +Intel® Extension for PyTorch provides further optimizations in jit mode for Transformers series models. It is highly recommended for users to take advantage of Intel® Extension for PyTorch with jit mode. Some frequently used operator patterns from Transformers models are already supported in Intel® Extension for PyTorch with jit mode fusions. Those fusion patterns like Multi-head-attention fusion, Concat Linear, Linear+Add, Linear+Gelu, Add+LayerNorm fusion and etc. are enabled and perform well. The benefit of the fusion is delivered to users in a transparent fashion. According to the analysis, ~70% of most popular NLP tasks in question-answering, text-classification, and token-classification can get performance benefits with these fusion patterns for both Float32 precision and BFloat16 Mixed precision. + +Check more detailed information for [IPEX Graph Optimization](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/graph_optimization.html). + +#### IPEX installation: + +IPEX release is following PyTorch, check the approaches for [IPEX installation](https://intel.github.io/intel-extension-for-pytorch/). + +### Usage of JIT-mode +To enable JIT-mode in Trainer for evaluaion or prediction, users should add `jit_mode_eval` in Trainer command arguments. + + + +for PyTorch >= 1.14.0. JIT-mode could benefit any models for prediction and evaluaion since dict input is supported in jit.trace + +for PyTorch < 1.14.0. JIT-mode could benefit models whose forward parameter order matches the tuple input order in jit.trace, like question-answering model +In the case where the forward parameter order does not match the tuple input order in jit.trace, like text-classification models, jit.trace will fail and we are capturing this with the exception here to make it fallback. Logging is used to notify users. + + + +Take an example of the use cases on [Transformers question-answering](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) + + +- Inference using jit mode on CPU: +
python run_qa.py \
+--model_name_or_path csarron/bert-base-uncased-squad-v1 \
+--dataset_name squad \
+--do_eval \
+--max_seq_length 384 \
+--doc_stride 128 \
+--output_dir /tmp/ \
+--no_cuda \
+--jit_mode_eval 
+ +- Inference with IPEX using jit mode on CPU: +
python run_qa.py \
+--model_name_or_path csarron/bert-base-uncased-squad-v1 \
+--dataset_name squad \
+--do_eval \
+--max_seq_length 384 \
+--doc_stride 128 \
+--output_dir /tmp/ \
+--no_cuda \
+--use_ipex \
+--jit_mode_eval
diff --git a/docs/source/en/perf_infer_cpu.mdx b/docs/source/en/perf_infer_cpu.mdx deleted file mode 100644 index a3df21e93a57..000000000000 --- a/docs/source/en/perf_infer_cpu.mdx +++ /dev/null @@ -1,71 +0,0 @@ - - -# Efficient Inference on CPU - -This guide focuses on inferencing large models efficiently on CPU. - -## `BetterTransformer` for faster inference - -We have recently integrated `BetterTransformer` for faster inference on CPU for text, image and audio models. Check the documentation about this integration [here](https://huggingface.co/docs/optimum/bettertransformer/overview) for more details. - -## PyTorch JIT-mode (TorchScript) -TorchScript is a way to create serializable and optimizable models from PyTorch code. Any TorchScript program can be saved from a Python process and loaded in a process where there is no Python dependency. -Comparing to default eager mode, jit mode in PyTorch normally yields better performance for model inference from optimization methodologies like operator fusion. - -For a gentle introduction to TorchScript, see the Introduction to [PyTorch TorchScript tutorial](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html#tracing-modules). - -### IPEX Graph Optimization with JIT-mode -Intel® Extension for PyTorch provides further optimizations in jit mode for Transformers series models. It is highly recommended for users to take advantage of Intel® Extension for PyTorch with jit mode. Some frequently used operator patterns from Transformers models are already supported in Intel® Extension for PyTorch with jit mode fusions. Those fusion patterns like Multi-head-attention fusion, Concat Linear, Linear+Add, Linear+Gelu, Add+LayerNorm fusion and etc. are enabled and perform well. The benefit of the fusion is delivered to users in a transparent fashion. According to the analysis, ~70% of most popular NLP tasks in question-answering, text-classification, and token-classification can get performance benefits with these fusion patterns for both Float32 precision and BFloat16 Mixed precision. - -Check more detailed information for [IPEX Graph Optimization](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/graph_optimization.html). - -#### IPEX installation: - -IPEX release is following PyTorch, check the approaches for [IPEX installation](https://intel.github.io/intel-extension-for-pytorch/). - -### Usage of JIT-mode -To enable JIT-mode in Trainer for evaluaion or prediction, users should add `jit_mode_eval` in Trainer command arguments. - - - -for PyTorch >= 1.14.0. JIT-mode could benefit any models for prediction and evaluaion since dict input is supported in jit.trace - -for PyTorch < 1.14.0. JIT-mode could benefit models whose forward parameter order matches the tuple input order in jit.trace, like question-answering model -In the case where the forward parameter order does not match the tuple input order in jit.trace, like text-classification models, jit.trace will fail and we are capturing this with the exception here to make it fallback. Logging is used to notify users. - - - -Take an example of the use cases on [Transformers question-answering](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) - - -- Inference using jit mode on CPU: -
python run_qa.py \
---model_name_or_path csarron/bert-base-uncased-squad-v1 \
---dataset_name squad \
---do_eval \
---max_seq_length 384 \
---doc_stride 128 \
---output_dir /tmp/ \
---no_cuda \
---jit_mode_eval 
- -- Inference with IPEX using jit mode on CPU: -
python run_qa.py \
---model_name_or_path csarron/bert-base-uncased-squad-v1 \
---dataset_name squad \
---do_eval \
---max_seq_length 384 \
---doc_stride 128 \
---output_dir /tmp/ \
---no_cuda \
---use_ipex \
---jit_mode_eval
diff --git a/docs/source/en/perf_infer_gpu_many.md b/docs/source/en/perf_infer_gpu_many.md new file mode 100644 index 000000000000..2118b5ddb404 --- /dev/null +++ b/docs/source/en/perf_infer_gpu_many.md @@ -0,0 +1,124 @@ + + +# Efficient Inference on a Multiple GPUs + +This document contains information on how to efficiently infer on a multiple GPUs. + + +Note: A multi GPU setup can use the majority of the strategies described in the [single GPU section](./perf_infer_gpu_one). You must be aware of simple techniques, though, that can be used for a better usage. + + + +## Flash Attention 2 + +Flash Attention 2 integration also works in a multi-GPU setup, check out the appropriate section in the [single GPU section](./perf_infer_gpu_one#Flash-Attention-2) + +## BetterTransformer + +[BetterTransformer](https://huggingface.co/docs/optimum/bettertransformer/overview) converts 🤗 Transformers models to use the PyTorch-native fastpath execution, which calls optimized kernels like Flash Attention under the hood. + +BetterTransformer is also supported for faster inference on single and multi-GPU for text, image, and audio models. + + + +Flash Attention can only be used for models using fp16 or bf16 dtype. Make sure to cast your model to the appropriate dtype before using BetterTransformer. + + + +### Decoder models + +For text models, especially decoder-based models (GPT, T5, Llama, etc.), the BetterTransformer API converts all attention operations to use the [`torch.nn.functional.scaled_dot_product_attention` operator](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention) (SDPA) that is only available in PyTorch 2.0 and onwards. + +To convert a model to BetterTransformer: + +```python +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m") +# convert the model to BetterTransformer +model.to_bettertransformer() + +# Use it for training or inference +``` + +SDPA can also call [Flash Attention](https://arxiv.org/abs/2205.14135) kernels under the hood. To enable Flash Attention or to check that it is available in a given setting (hardware, problem size), use [`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager: + + +```diff +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m").to("cuda") +# convert the model to BetterTransformer +model.to_bettertransformer() + +input_text = "Hello my dog is cute and" +inputs = tokenizer(input_text, return_tensors="pt").to("cuda") + ++ with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): + outputs = model.generate(**inputs) + +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) +``` + +If you see a bug with a traceback saying + +```bash +RuntimeError: No available kernel. Aborting execution. +``` + +try using the PyTorch nightly version, which may have a broader coverage for Flash Attention: + +```bash +pip3 install -U --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118 +``` + +Have a look at this [blog post](https://pytorch.org/blog/out-of-the-box-acceleration/) to learn more about what is possible with the BetterTransformer + SDPA API. 
+ +### Encoder models + +For encoder models during inference, BetterTransformer dispatches the forward call of encoder layers to an equivalent of [`torch.nn.TransformerEncoderLayer`](https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoderLayer.html) that will execute the fastpath implementation of the encoder layers. + +Because `torch.nn.TransformerEncoderLayer` fastpath does not support training, it is dispatched to `torch.nn.functional.scaled_dot_product_attention` instead, which does not leverage nested tensors but can use Flash Attention or Memory-Efficient Attention fused kernels. + +More details about BetterTransformer performance can be found in this [blog post](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2), and you can learn more about BetterTransformer for encoder models in this [blog](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/). + + +## Advanced usage: mixing FP4 (or Int8) and BetterTransformer + +You can combine the different methods described above to get the best performance for your model. For example, you can use BetterTransformer with FP4 mixed-precision inference + flash attention: + +```py +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + +quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.float16 +) + +tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", quantization_config=quantization_config) + +input_text = "Hello my dog is cute and" +inputs = tokenizer(input_text, return_tensors="pt").to("cuda") + +with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): + outputs = model.generate(**inputs) + +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) +``` \ No newline at end of file diff --git a/docs/source/en/perf_infer_gpu_many.mdx b/docs/source/en/perf_infer_gpu_many.mdx deleted file mode 100644 index d8a24d6ab8ae..000000000000 --- a/docs/source/en/perf_infer_gpu_many.mdx +++ /dev/null @@ -1,23 +0,0 @@ - - -# Efficient Inference on a Multiple GPUs - -This document contains information on how to efficiently infer on a multiple GPUs. - - -Note: A multi GPU setup can use the majority of the strategies described in the [single GPU section](./perf_infer_gpu_one). You must be aware of simple techniques, though, that can be used for a better usage. - - - -## `BetterTransformer` for faster inference - -We have recently integrated `BetterTransformer` for faster inference on multi-GPU for text, image and audio models. Check the documentation about this integration [here](https://huggingface.co/docs/optimum/bettertransformer/overview) for more details. diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md new file mode 100644 index 000000000000..d24299012e9f --- /dev/null +++ b/docs/source/en/perf_infer_gpu_one.md @@ -0,0 +1,422 @@ + + +# Efficient Inference on a Single GPU + +In addition to this guide, relevant information can be found as well in [the guide for training on a single GPU](perf_train_gpu_one) and [the guide for inference on CPUs](perf_infer_cpu). + +## Flash Attention 2 + + + +Note that this feature is experimental and might considerably change in future versions. For instance, the Flash Attention 2 API might migrate to `BetterTransformer` API in the near future. 
+ + + +Flash Attention 2 can considerably speed up transformer-based models' training and inference speed. Flash Attention 2 has been introduced in the [official Flash Attention repository](https://github.com/Dao-AILab/flash-attention) by Tri Dao et al. The scientific paper on Flash Attention can be found [here](https://arxiv.org/abs/2205.14135). + +Make sure to follow the installation guide on the repository mentioned above to properly install Flash Attention 2. Once that package is installed, you can benefit from this feature. + +We natively support Flash Attention 2 for the following models: + +- Llama +- Mistral +- Falcon + +You can request to add Flash Attention 2 support for more models by opening an issue on GitHub, and even open a Pull Request to integrate the changes. The supported models can be used for inference and training, including training with padding tokens - *which is currently not supported for `BetterTransformer` API below.* + + + +Flash Attention 2 can only be used when the models' dtype is `fp16` or `bf16` and runs only on NVIDIA-GPU devices. Make sure to cast your model to the appropriate dtype and load them on a supported device before using that feature. + + + +### Quick usage + +To enable Flash Attention 2 in your model, add `use_flash_attention_2` in the `from_pretrained` arguments: + +```python +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM + +model_id = "tiiuae/falcon-7b" +tokenizer = AutoTokenizer.from_pretrained(model_id) + +model = AutoModelForCausalLM.from_pretrained( + model_id, + torch_dtype=torch.bfloat16, + use_flash_attention_2=True, +) +``` + +And use it for generation or fine-tuning. + +### Expected speedups + +You can benefit from considerable speedups for fine-tuning and inference, especially for long sequences. However, since Flash Attention does not support computing attention scores with padding tokens under the hood, we must manually pad / unpad the attention scores for batched inference when the sequence contains padding tokens. This leads to a significant slowdown for batched generations with padding tokens. + +To overcome this, one should use Flash Attention without padding tokens in the sequence for training (e.g., by packing a dataset, i.e., concatenating sequences until reaching the maximum sequence length. An example is provided [here](https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py#L516). + +Below is the expected speedup you can get for a simple forward pass on [tiiuae/falcon-7b](https://hf.co/tiiuae/falcon-7b) with a sequence length of 4096 and various batch sizes, without padding tokens: + +
+ +
+ +Below is the expected speedup you can get for a simple forward pass on [`meta-llama/Llama-7b-hf`](https://hf.co/meta-llama/Llama-7b-hf) with a sequence length of 4096 and various batch sizes, without padding tokens: + +
+ +
+ +For sequences with padding tokens (training with padding tokens or generating with padding tokens), we need to unpad / pad the input sequences to compute the attention scores correctly. For relatively short sequence lengths and a pure forward pass, this creates an overhead that results in only a small speedup (here, below 30% of the input is filled with padding tokens). + +
+ +
+ +For large sequence lengths, however, you can benefit from an interesting speedup for pure inference (and training as well). + +Note that Flash Attention makes the attention computation more memory efficient, meaning you can train with much larger sequence lengths without facing CUDA OOM issues. It can lead to a memory reduction of up to 20x for large sequence lengths. Check out [the official flash attention repository](https://github.com/Dao-AILab/flash-attention) for more details. + +
+ +
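Before moving on to the advanced combinations below, here is a minimal end-to-end generation sketch that reuses the quick-usage setup from earlier in this section; the checkpoint, prompt, and generation length are illustrative, and a CUDA GPU with the `flash-attn` package (plus `accelerate` for `device_map="auto"`) is assumed:

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Same checkpoint as in the quick-usage example above; requires a supported NVIDIA GPU.
model_id = "tiiuae/falcon-7b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    use_flash_attention_2=True,
    device_map="auto",
)

# A single sequence, so no padding tokens are involved.
inputs = tokenizer("Flash Attention 2 speeds up inference because", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=30)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```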
+ + +### Advanced usage + +You can combine this feature with many exisiting feature for model optimization. Check out few examples below: + +### Combining Flash Attention 2 and 8-bit models + +You can combine this feature together with 8-bit quantization: + +```python +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM + +model_id = "tiiuae/falcon-7b" +tokenizer = AutoTokenizer.from_pretrained(model_id) + +model = AutoModelForCausalLM.from_pretrained( + model_id, + load_in_8bit=True, + use_flash_attention_2=True, +) +``` + +### Combining Flash Attention 2 and 4-bit models + +You can combine this feature together with 4-bit quantization: + +```python +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM + +model_id = "tiiuae/falcon-7b" +tokenizer = AutoTokenizer.from_pretrained(model_id) + +model = AutoModelForCausalLM.from_pretrained( + model_id, + load_in_4bit=True, + use_flash_attention_2=True, +) +``` + +### Combining Flash Attention 2 and PEFT + +You can combine this feature together with PEFT for training adapters using Flash Attention 2 under the hood: + +```python +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM +from peft import LoraConfig + +model_id = "tiiuae/falcon-7b" +tokenizer = AutoTokenizer.from_pretrained(model_id) + +model = AutoModelForCausalLM.from_pretrained( + model_id, + load_in_4bit=True, + use_flash_attention_2=True, +) + +lora_config = LoraConfig( + r=8, + task_type="CAUSAL_LM" +) + +model.add_adapter(lora_config) + +... # train your model +``` + +## BetterTransformer + +[BetterTransformer](https://huggingface.co/docs/optimum/bettertransformer/overview) converts 🤗 Transformers models to use the PyTorch-native fastpath execution, which calls optimized kernels like Flash Attention under the hood. + +BetterTransformer is also supported for faster inference on single and multi-GPU for text, image, and audio models. + + + +Flash Attention can only be used for models using fp16 or bf16 dtype. Make sure to cast your model to the appropriate dtype before using BetterTransformer. + + + +### Encoder models + +PyTorch-native [`nn.MultiHeadAttention`](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/) attention fastpath, called BetterTransformer, can be used with Transformers through the integration in the [🤗 Optimum library](https://huggingface.co/docs/optimum/bettertransformer/overview). + +PyTorch's attention fastpath allows to speed up inference through kernel fusions and the use of [nested tensors](https://pytorch.org/docs/stable/nested.html). Detailed benchmarks can be found in [this blog post](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2). 
+ +After installing the [`optimum`](https://github.com/huggingface/optimum) package, to use Better Transformer during inference, the relevant internal modules are replaced by calling [`~PreTrainedModel.to_bettertransformer`]: + +```python +model = model.to_bettertransformer() +``` + +The method [`~PreTrainedModel.reverse_bettertransformer`] allows to go back to the original modeling, which should be used before saving the model in order to use the canonical transformers modeling: + +```python +model = model.reverse_bettertransformer() +model.save_pretrained("saved_model") +``` + +Have a look at this [blog post](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2) to learn more about what is possible to do with `BetterTransformer` API for encoder models. + +### Decoder models + +For text models, especially decoder-based models (GPT, T5, Llama, etc.), the BetterTransformer API converts all attention operations to use the [`torch.nn.functional.scaled_dot_product_attention` operator](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention) (SDPA) that is only available in PyTorch 2.0 and onwards. + +To convert a model to BetterTransformer: + +```python +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m") +# convert the model to BetterTransformer +model.to_bettertransformer() + +# Use it for training or inference +``` + +SDPA can also call [Flash Attention](https://arxiv.org/abs/2205.14135) kernels under the hood. To enable Flash Attention or to check that it is available in a given setting (hardware, problem size), use [`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager: + + +```diff +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16).to("cuda") +# convert the model to BetterTransformer +model.to_bettertransformer() + +input_text = "Hello my dog is cute and" +inputs = tokenizer(input_text, return_tensors="pt").to("cuda") + ++ with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): + outputs = model.generate(**inputs) + +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) +``` + +If you see a bug with a traceback saying + +```bash +RuntimeError: No available kernel. Aborting execution. +``` + +try using the PyTorch nightly version, which may have a broader coverage for Flash Attention: + +```bash +pip3 install -U --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118 +``` + +Or make sure your model is correctly casted in float16 or bfloat16 + + +Have a look at [this detailed blogpost](https://pytorch.org/blog/out-of-the-box-acceleration/) to read more about what is possible to do with `BetterTransformer` + SDPA API. + +## `bitsandbytes` integration for FP4 mixed-precision inference + +You can install `bitsandbytes` and benefit from easy model compression on GPUs. Using FP4 quantization you can expect to reduce up to 8x the model size compared to its native full precision version. Check out below how to get started. + + + +Note that this feature can also be used in a multi GPU setup. 
+ + + +### Requirements [[requirements-for-fp4-mixedprecision-inference]] + +- Latest `bitsandbytes` library +`pip install bitsandbytes>=0.39.0` + +- Install latest `accelerate` from source +`pip install git+https://github.com/huggingface/accelerate.git` + +- Install latest `transformers` from source +`pip install git+https://github.com/huggingface/transformers.git` + +### Running FP4 models - single GPU setup - Quickstart + +You can quickly run a FP4 model on a single GPU by running the following code: + +```py +from transformers import AutoModelForCausalLM + +model_name = "bigscience/bloom-2b5" +model_4bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_4bit=True) +``` +Note that `device_map` is optional but setting `device_map = 'auto'` is prefered for inference as it will dispatch efficiently the model on the available ressources. + +### Running FP4 models - multi GPU setup + +The way to load your mixed 4-bit model in multiple GPUs is as follows (same command as single GPU setup): +```py +model_name = "bigscience/bloom-2b5" +model_4bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_4bit=True) +``` +But you can control the GPU RAM you want to allocate on each GPU using `accelerate`. Use the `max_memory` argument as follows: + +```py +max_memory_mapping = {0: "600MB", 1: "1GB"} +model_name = "bigscience/bloom-3b" +model_4bit = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", load_in_4bit=True, max_memory=max_memory_mapping +) +``` +In this example, the first GPU will use 600MB of memory and the second 1GB. + +### Advanced usage + +For more advanced usage of this method, please have a look at the [quantization](main_classes/quantization) documentation page. + +## `bitsandbytes` integration for Int8 mixed-precision matrix decomposition + + + +Note that this feature can also be used in a multi GPU setup. + + + +From the paper [`LLM.int8() : 8-bit Matrix Multiplication for Transformers at Scale`](https://arxiv.org/abs/2208.07339), we support Hugging Face integration for all models in the Hub with a few lines of code. +The method reduces `nn.Linear` size by 2 for `float16` and `bfloat16` weights and by 4 for `float32` weights, with close to no impact to the quality by operating on the outliers in half-precision. + +![HFxbitsandbytes.png](https://cdn-uploads.huggingface.co/production/uploads/1659861207959-62441d1d9fdefb55a0b7d12c.png) + +Int8 mixed-precision matrix decomposition works by separating a matrix multiplication into two streams: (1) a systematic feature outlier stream matrix multiplied in fp16 (0.01%), (2) a regular stream of int8 matrix multiplication (99.9%). With this method, int8 inference with no predictive degradation is possible for very large models. +For more details regarding the method, check out the [paper](https://arxiv.org/abs/2208.07339) or our [blogpost about the integration](https://huggingface.co/blog/hf-bitsandbytes-integration). + +![MixedInt8.gif](https://cdn-uploads.huggingface.co/production/uploads/1660567469965-62441d1d9fdefb55a0b7d12c.gif) + +Note, that you would require a GPU to run mixed-8bit models as the kernels have been compiled for GPUs only. Make sure that you have enough GPU memory to store the quarter (or half if your model weights are in half precision) of the model before using this feature. +Below are some notes to help you use this module, or follow the demos on [Google colab](#colab-demos). 
+ +### Requirements [[requirements-for-int8-mixedprecision-matrix-decomposition]] + +- If you have `bitsandbytes<0.37.0`, make sure you run on NVIDIA GPUs that support 8-bit tensor cores (Turing, Ampere or newer architectures - e.g. T4, RTX20s, RTX30s, A40-A100). For `bitsandbytes>=0.37.0`, all GPUs should be supported. +- Install the correct version of `bitsandbytes` by running: +`pip install bitsandbytes>=0.31.5` +- Install `accelerate` +`pip install accelerate>=0.12.0` + +### Running mixed-Int8 models - single GPU setup + +After installing the required libraries, the way to load your mixed 8-bit model is as follows: + +```py +from transformers import AutoModelForCausalLM + +model_name = "bigscience/bloom-2b5" +model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True) +``` + +For text generation, we recommend: + +* using the model's `generate()` method instead of the `pipeline()` function. Although inference is possible with the `pipeline()` function, it is not optimized for mixed-8bit models and will be slower than using the `generate()` method. Moreover, some sampling strategies, like nucleus sampling, are not supported by the `pipeline()` function for mixed-8bit models. +* placing all inputs on the same device as the model. + +Here is a simple example: + +```py +from transformers import AutoModelForCausalLM, AutoTokenizer + +model_name = "bigscience/bloom-2b5" +tokenizer = AutoTokenizer.from_pretrained(model_name) +model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True) + +prompt = "Hello, my llama is cute" +inputs = tokenizer(prompt, return_tensors="pt").to("cuda") +generated_ids = model_8bit.generate(**inputs) +outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) +``` + + +### Running mixed-int8 models - multi GPU setup + +The way to load your mixed 8-bit model on multiple GPUs is as follows (same command as the single GPU setup): +```py +model_name = "bigscience/bloom-2b5" +model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True) +``` +But you can control the GPU RAM you want to allocate on each GPU using `accelerate`. Use the `max_memory` argument as follows: + +```py +max_memory_mapping = {0: "1GB", 1: "2GB"} +model_name = "bigscience/bloom-3b" +model_8bit = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping +) +``` +In this example, the first GPU will use 1GB of memory and the second 2GB. + +### Colab demos + +With this method you can run inference on models that were previously impossible to run on Google Colab. +Check out the demo for running T5-11b (42GB in fp32) with 8-bit quantization on Google Colab: + +[![Open In Colab: T5-11b demo](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1YORPWx4okIHXnjW7MSAidXN29mPVNT7F?usp=sharing) + +Or this demo for BLOOM-3B: + +[![Open In Colab: BLOOM-3b demo](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1qOjXfQIAULfKvZqwCen8-MoWKGdSatZ4?usp=sharing) + +## Advanced usage: mixing FP4 (or Int8) and BetterTransformer + +You can combine the different methods described above to get the best performance for your model. 
For example, you can use BetterTransformer with FP4 mixed-precision inference + flash attention: + +```py +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + +quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.float16 +) + +tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", quantization_config=quantization_config) + +input_text = "Hello my dog is cute and" +inputs = tokenizer(input_text, return_tensors="pt").to("cuda") + +with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): + outputs = model.generate(**inputs) + +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) +``` diff --git a/docs/source/en/perf_infer_gpu_one.mdx b/docs/source/en/perf_infer_gpu_one.mdx deleted file mode 100644 index 086e2ff48709..000000000000 --- a/docs/source/en/perf_infer_gpu_one.mdx +++ /dev/null @@ -1,80 +0,0 @@ - - -# Efficient Inference on a Single GPU - -This document will be completed soon with information on how to infer on a single GPU. In the meantime you can check out [the guide for training on a single GPU](perf_train_gpu_one) and [the guide for inference on CPUs](perf_infer_cpu). - -## `BetterTransformer` for faster inference - -We have recently integrated `BetterTransformer` for faster inference on GPU for text, image and audio models. Check the documentation about this integration [here](https://huggingface.co/docs/optimum/bettertransformer/overview) for more details. - -## `bitsandbytes` integration for Int8 mixed-precision matrix decomposition - -Note that this feature is also totally applicable in a multi GPU setup as well. - -From the paper [`LLM.int8() : 8-bit Matrix Multiplication for Transformers at Scale`](https://arxiv.org/abs/2208.07339), we support HuggingFace integration for all models in the Hub with a few lines of code. -The method reduce `nn.Linear` size by 2 for `float16` and `bfloat16` weights and by 4 for `float32` weights, with close to no impact to the quality by operating on the outliers in half-precision. - -![HFxbitsandbytes.png](https://s3.amazonaws.com/moonup/production/uploads/1659861207959-62441d1d9fdefb55a0b7d12c.png) - -Int8 mixed-precision matrix decomposition works by separating a matrix multiplication into two streams: (1) a systematic feature outlier stream matrix multiplied in fp16 (0.01%), (2) a regular stream of int8 matrix multiplication (99.9%). With this method, int8 inference with no predictive degradation is possible for very large models. -For more details regarding the method, check out the [paper](https://arxiv.org/abs/2208.07339) or our [blogpost about the integration](https://huggingface.co/blog/hf-bitsandbytes-integration). - -![MixedInt8.gif](https://s3.amazonaws.com/moonup/production/uploads/1660567469965-62441d1d9fdefb55a0b7d12c.gif) - -Note, that you would require a GPU to run mixed-8bit models as the kernels have been compiled for GPUs only. Make sure that you have enough GPU memory to store the quarter (or half if your model weights are in half precision) of the model before using this feature. -Below are some notes to help you use this module, or follow the demos on [Google colab](#colab-demos). - -### Requirements - -- Make sure you run that on NVIDIA GPUs that support 8-bit tensor cores (Turing, Ampere or newer architectures - e.g. T4, RTX20s RTX30s, A40-A100). 
-- Install the correct version of `bitsandbytes` by running: -`pip install bitsandbytes>=0.31.5` -- Install `accelerate` -`pip install accelerate>=0.12.0` - -### Running mixed-int8 models - single GPU setup - -After installing the required libraries, the way to load your mixed 8-bit model is as follows: -```py -model_name = "bigscience/bloom-2b5" -model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True) -``` - -### Running mixed-int8 models - multi GPU setup - -The way to load your mixed 8-bit model in multiple GPUs is as follows (same command as single GPU setup): -```py -model_name = "bigscience/bloom-2b5" -model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True) -``` -But you can control the GPU RAM you want to allocate on each GPU using `accelerate`. Use the `max_memory` argument as follows: - -```py -max_memory_mapping = {0: "1GB", 1: "2GB"} -model_name = "bigscience/bloom-3b" -model_8bit = AutoModelForCausalLM.from_pretrained( - model_name, device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping -) -``` -In this example, the first GPU will use 1GB of memory and the second 2GB. - -### Colab demos - -With this method you can infer on models that were not possible to infer on a Google Colab before. -Check out the demo for running T5-11b (42GB in fp32)! Using 8-bit quantization on Google Colab: - -[![Open In Colab: T5-11b demo](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1YORPWx4okIHXnjW7MSAidXN29mPVNT7F?usp=sharing) - -Or this demo for BLOOM-3B: - -[![Open In Colab: BLOOM-3b demo](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1qOjXfQIAULfKvZqwCen8-MoWKGdSatZ4?usp=sharing) \ No newline at end of file diff --git a/docs/source/en/perf_infer_special.md b/docs/source/en/perf_infer_special.md new file mode 100644 index 000000000000..e5744754b88e --- /dev/null +++ b/docs/source/en/perf_infer_special.md @@ -0,0 +1,18 @@ + + +# Inference on Specialized Hardware + +This document will be completed soon with information on how to infer on specialized hardware. In the meantime you can check out [the guide for inference on CPUs](perf_infer_cpu). \ No newline at end of file diff --git a/docs/source/en/perf_infer_special.mdx b/docs/source/en/perf_infer_special.mdx deleted file mode 100644 index e18a9a104883..000000000000 --- a/docs/source/en/perf_infer_special.mdx +++ /dev/null @@ -1,14 +0,0 @@ - - -# Inference on Specialized Hardware - -This document will be completed soon with information on how to infer on specialized hardware. In the meantime you can check out [the guide for inference on CPUs](perf_infer_cpu). \ No newline at end of file diff --git a/docs/source/en/perf_torch_compile.md b/docs/source/en/perf_torch_compile.md new file mode 100644 index 000000000000..a840e7d551ce --- /dev/null +++ b/docs/source/en/perf_torch_compile.md @@ -0,0 +1,359 @@ + + +# Optimize inference using torch.compile() + +This guide aims to provide a benchmark on the inference speed-ups introduced with [`torch.compile()`](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) for [computer vision models in 🤗 Transformers](https://huggingface.co/models?pipeline_tag=image-classification&library=transformers&sort=trending). + +## Benefits of torch.compile + +Depending on the model and the GPU, `torch.compile()` yields up to 30% speed-up during inference. 
To use `torch.compile()`, simply install any version of `torch` above 2.0. + +Compiling a model takes time, so it is most useful when you compile the model only once instead of every time you infer. +To compile any computer vision model of your choice, call `torch.compile()` on the model as shown below: + +```diff +from transformers import AutoModelForImageClassification + +model = AutoModelForImageClassification.from_pretrained(MODEL_ID).to("cuda") ++ model = torch.compile(model) +``` + +`compile()` comes with multiple modes for compiling, which essentially differ in compilation time and inference overhead. `max-autotune` takes longer than `reduce-overhead` but results in faster inference. The default mode is fastest for compilation but is not as efficient as `reduce-overhead` for inference time. In this guide, we used the default mode. You can learn more about it [here](https://pytorch.org/get-started/pytorch-2.0/#user-experience). + +We benchmarked `torch.compile` with different computer vision models, tasks, types of hardware, and batch sizes on `torch` version 2.0.1. + +## Benchmarking code + +Below you can find the benchmarking code for each task. We warm up the GPU before inference and take the mean time of 300 inferences, using the same image each time. + +### Image Classification with ViT + +```python +import torch +from PIL import Image +import requests +import numpy as np +from transformers import AutoImageProcessor, AutoModelForImageClassification + +url = 'http://images.cocodataset.org/val2017/000000039769.jpg' +image = Image.open(requests.get(url, stream=True).raw) + +processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") +model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224").to("cuda") +model = torch.compile(model) + +processed_input = processor(image, return_tensors='pt').to(device="cuda") + +with torch.no_grad(): + _ = model(**processed_input) + +``` + +### Object Detection with DETR + +```python +from transformers import AutoImageProcessor, AutoModelForObjectDetection + +processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50") +model = AutoModelForObjectDetection.from_pretrained("facebook/detr-resnet-50").to("cuda") +model = torch.compile(model) + +inputs = processor(images=image, return_tensors="pt").to("cuda") + +with torch.no_grad(): + _ = model(**inputs) +``` + +### Image Segmentation with Segformer + +```python +from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation + +processor = SegformerImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") +model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512").to("cuda") +model = torch.compile(model) +seg_inputs = processor(images=image, return_tensors="pt").to("cuda") + +with torch.no_grad(): + _ = model(**seg_inputs) +``` + +Below you can find the list of the models we benchmarked. 
+ +**Image Classification** +- [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) +- [microsoft/beit-base-patch16-224-pt22k-ft22k](https://huggingface.co/microsoft/beit-base-patch16-224-pt22k-ft22k) +- [facebook/convnext-large-224](https://huggingface.co/facebook/convnext-large-224) +- [microsoft/resnet-50](https://huggingface.co/microsoft/resnet-50) + +**Image Segmentation** +- [nvidia/segformer-b0-finetuned-ade-512-512](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512) +- [facebook/mask2former-swin-tiny-coco-panoptic](https://huggingface.co/facebook/mask2former-swin-tiny-coco-panoptic) +- [facebook/maskformer-swin-base-ade](https://huggingface.co/facebook/maskformer-swin-base-ade) +- [google/deeplabv3_mobilenet_v2_1.0_513](https://huggingface.co/google/deeplabv3_mobilenet_v2_1.0_513) + +**Object Detection** +- [google/owlvit-base-patch32](https://huggingface.co/google/owlvit-base-patch32) +- [facebook/detr-resnet-101](https://huggingface.co/facebook/detr-resnet-101) +- [microsoft/conditional-detr-resnet-50](https://huggingface.co/microsoft/conditional-detr-resnet-50) + +Below you can find visualization of inference durations with and without `torch.compile()` and percentage improvements for each model in different hardware and batch sizes.
+ + +![Duration Comparison on V100 with Batch Size of 1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/torch_compile/v100_1_duration.png) + +![Percentage Improvement on T4 with Batch Size of 4](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/torch_compile/T4_4_percentage.png) + +Below you can find inference durations in milliseconds for each model with and without `compile()`. Note that OwlViT results in OOM in larger batch sizes. + +### A100 (batch size: 1) + +| **Task/Model** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:| +| Image Classification/ViT | 9.325 | 7.584 | +| Image Segmentation/Segformer | 11.759 | 10.500 | +| Object Detection/OwlViT | 24.978 | 18.420 | +| Image Classification/BeiT | 11.282 | 8.448 | +| Object Detection/DETR | 34.619 | 19.040 | +| Image Classification/ConvNeXT | 10.410 | 10.208 | +| Image Classification/ResNet | 6.531 | 4.124 | +| Image Segmentation/Mask2former | 60.188 | 49.117 | +| Image Segmentation/Maskformer | 75.764 | 59.487 | +| Image Segmentation/MobileNet | 8.583 | 3.974 | +| Object Detection/Resnet-101 | 36.276 | 18.197 | +| Object Detection/Conditional-DETR | 31.219 | 17.993 | + + +### A100 (batch size: 4) + +| **Task/Model** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:| +| Image Classification/ViT | 14.832 | 14.499 | +| Image Segmentation/Segformer | 18.838 | 16.476 | +| Image Classification/BeiT | 13.205 | 13.048 | +| Object Detection/DETR | 48.657 | 32.418| +| Image Classification/ConvNeXT | 22.940 | 21.631 | +| Image Classification/ResNet | 6.657 | 4.268 | +| Image Segmentation/Mask2former | 74.277 | 61.781 | +| Image Segmentation/Maskformer | 180.700 | 159.116 | +| Image Segmentation/MobileNet | 14.174 | 8.515 | +| Object Detection/Resnet-101 | 68.101 | 44.998 | +| Object Detection/Conditional-DETR | 56.470 | 35.552 | + +### A100 (batch size: 16) + +| **Task/Model** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:| +| Image Classification/ViT | 40.944 | 40.010 | +| Image Segmentation/Segformer | 37.005 | 31.144 | +| Image Classification/BeiT | 41.854 | 41.048 | +| Object Detection/DETR | 164.382 | 161.902 | +| Image Classification/ConvNeXT | 82.258 | 75.561 | +| Image Classification/ResNet | 7.018 | 5.024 | +| Image Segmentation/Mask2former | 178.945 | 154.814 | +| Image Segmentation/Maskformer | 638.570 | 579.826 | +| Image Segmentation/MobileNet | 51.693 | 30.310 | +| Object Detection/Resnet-101 | 232.887 | 155.021 | +| Object Detection/Conditional-DETR | 180.491 | 124.032 | + +### V100 (batch size: 1) + +| **Task/Model** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:| +| Image Classification/ViT | 10.495 | 6.00 | +| Image Segmentation/Segformer | 13.321 | 5.862 | +| Object Detection/OwlViT | 25.769 | 22.395 | +| Image Classification/BeiT | 11.347 | 7.234 | +| Object Detection/DETR | 33.951 | 19.388 | +| Image Classification/ConvNeXT | 11.623 | 10.412 | +| Image Classification/ResNet | 6.484 | 3.820 | +| Image Segmentation/Mask2former | 64.640 | 49.873 | +| Image Segmentation/Maskformer | 95.532 | 72.207 | +| Image Segmentation/MobileNet | 9.217 | 4.753 | +| Object Detection/Resnet-101 | 52.818 | 28.367 | +| Object Detection/Conditional-DETR | 39.512 | 20.816 | + +### V100 (batch size: 4) + +| **Task/Model** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:| +| Image Classification/ViT | 15.181 | 14.501 | +| Image Segmentation/Segformer | 16.787 | 16.188 | +| Image Classification/BeiT | 15.171 | 14.753 | +| Object Detection/DETR | 88.529 | 64.195 | +| Image Classification/ConvNeXT | 29.574 | 27.085 | +| Image Classification/ResNet | 6.109 | 4.731 | +| Image Segmentation/Mask2former | 90.402 | 76.926 | +| Image Segmentation/Maskformer | 234.261 | 205.456 | +| Image Segmentation/MobileNet | 24.623 | 14.816 | +| Object Detection/Resnet-101 | 134.672 | 101.304 | +| Object Detection/Conditional-DETR | 97.464 | 69.739 | + +### V100 (batch size: 16) + +| **Task/Model** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:| +| Image Classification/ViT | 52.209 | 51.633 | +| Image Segmentation/Segformer | 61.013 | 55.499 | +| Image Classification/BeiT | 53.938 | 53.581 | +| Object Detection/DETR | OOM | OOM | +| Image Classification/ConvNeXT | 109.682 | 100.771 | +| Image Classification/ResNet | 14.857 | 12.089 | +| Image Segmentation/Mask2former | 249.605 | 222.801 | +| Image Segmentation/Maskformer | 831.142 | 743.645 | +| Image Segmentation/MobileNet | 93.129 | 55.365 | +| Object Detection/Resnet-101 | 482.425 | 361.843 | +| Object Detection/Conditional-DETR | 344.661 | 255.298 | + +### T4 (batch size: 1) + +| **Task/Model** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:| +| Image Classification/ViT | 16.520 | 15.786 | +| Image Segmentation/Segformer | 16.116 | 14.205 | +| Object Detection/OwlViT | 53.634 | 51.105 | +| Image Classification/BeiT | 16.464 | 15.710 | +| Object Detection/DETR | 73.100 | 53.99 | +| Image Classification/ConvNeXT | 32.932 | 30.845 | +| Image Classification/ResNet | 6.031 | 4.321 | +| Image Segmentation/Mask2former | 79.192 | 66.815 | +| Image Segmentation/Maskformer | 200.026 | 188.268 | +| Image Segmentation/MobileNet | 18.908 | 11.997 | +| Object Detection/Resnet-101 | 106.622 | 82.566 | +| Object Detection/Conditional-DETR | 77.594 | 56.984 | + +### T4 (batch size: 4) + +| **Task/Model** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:| +| Image Classification/ViT | 43.653 | 43.626 | +| Image Segmentation/Segformer | 45.327 | 42.445 | +| Image Classification/BeiT | 52.007 | 51.354 | +| Object Detection/DETR | 277.850 | 268.003 | +| Image Classification/ConvNeXT | 119.259 | 105.580 | +| Image Classification/ResNet | 13.039 | 11.388 | +| Image Segmentation/Mask2former | 201.540 | 184.670 | +| Image Segmentation/Maskformer | 764.052 | 711.280 | +| Image Segmentation/MobileNet | 74.289 | 48.677 | +| Object Detection/Resnet-101 | 421.859 | 357.614 | +| Object Detection/Conditional-DETR | 289.002 | 226.945 | + +### T4 (batch size: 16) + +| **Task/Model** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:| +| Image Classification/ViT | 163.914 | 160.907 | +| Image Segmentation/Segformer | 192.412 | 163.620 | +| Image Classification/BeiT | 188.978 | 187.976 | +| Object Detection/DETR | OOM | OOM | +| Image Classification/ConvNeXT | 422.886 | 388.078 | +| Image Classification/ResNet | 44.114 | 37.604 | +| Image Segmentation/Mask2former | 756.337 | 695.291 | +| Image Segmentation/Maskformer | 2842.940 | 2656.88 | +| Image Segmentation/MobileNet | 299.003 | 201.942 | +| Object Detection/Resnet-101 | 1619.505 | 1262.758 | +| Object Detection/Conditional-DETR | 1137.513 | 897.390| + +## PyTorch Nightly +We also benchmarked on PyTorch nightly (2.1.0dev, find the wheel [here](https://download.pytorch.org/whl/nightly/cu118)) and observed improvement in latency both for uncompiled and compiled models. + +### A100 + +| **Task/Model** | **Batch Size** | **torch 2.0 - no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:|:---:| +| Image Classification/BeiT | Unbatched | 12.462 | 6.954 | +| Image Classification/BeiT | 4 | 14.109 | 12.851 | +| Image Classification/BeiT | 16 | 42.179 | 42.147 | +| Object Detection/DETR | Unbatched | 30.484 | 15.221 | +| Object Detection/DETR | 4 | 46.816 | 30.942 | +| Object Detection/DETR | 16 | 163.749 | 163.706 | + +### T4 + +| **Task/Model** | **Batch Size** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:|:---:| +| Image Classification/BeiT | Unbatched | 14.408 | 14.052 | +| Image Classification/BeiT | 4 | 47.381 | 46.604 | +| Image Classification/BeiT | 16 | 42.179 | 42.147 | +| Object Detection/DETR | Unbatched | 68.382 | 53.481 | +| Object Detection/DETR | 4 | 269.615 | 204.785 | +| Object Detection/DETR | 16 | OOM | OOM | + +### V100 + +| **Task/Model** | **Batch Size** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:|:---:| +| Image Classification/BeiT | Unbatched | 13.477 | 7.926 | +| Image Classification/BeiT | 4 | 15.103 | 14.378 | +| Image Classification/BeiT | 16 | 52.517 | 51.691 | +| Object Detection/DETR | Unbatched | 28.706 | 19.077 | +| Object Detection/DETR | 4 | 88.402 | 62.949| +| Object Detection/DETR | 16 | OOM | OOM | + + +## Reduce Overhead +We benchmarked `reduce-overhead` compilation mode for A100 and T4 in Nightly. + +### A100 + +| **Task/Model** | **Batch Size** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:|:---:| +| Image Classification/ConvNeXT | Unbatched | 11.758 | 7.335 | +| Image Classification/ConvNeXT | 4 | 23.171 | 21.490 | +| Image Classification/ResNet | Unbatched | 7.435 | 3.801 | +| Image Classification/ResNet | 4 | 7.261 | 2.187 | +| Object Detection/Conditional-DETR | Unbatched | 32.823 | 11.627 | +| Object Detection/Conditional-DETR | 4 | 50.622 | 33.831 | +| Image Segmentation/MobileNet | Unbatched | 9.869 | 4.244 | +| Image Segmentation/MobileNet | 4 | 14.385 | 7.946 | + + +### T4 + +| **Task/Model** | **Batch Size** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:|:---:| +| Image Classification/ConvNeXT | Unbatched | 32.137 | 31.84 | +| Image Classification/ConvNeXT | 4 | 120.944 | 110.209 | +| Image Classification/ResNet | Unbatched | 9.761 | 7.698 | +| Image Classification/ResNet | 4 | 15.215 | 13.871 | +| Object Detection/Conditional-DETR | Unbatched | 72.150 | 57.660 | +| Object Detection/Conditional-DETR | 4 | 301.494 | 247.543 | +| Image Segmentation/MobileNet | Unbatched | 22.266 | 19.339 | +| Image Segmentation/MobileNet | 4 | 78.311 | 50.983 | + + diff --git a/docs/source/en/perf_train_cpu.md b/docs/source/en/perf_train_cpu.md new file mode 100644 index 000000000000..9c81820ce7d5 --- /dev/null +++ b/docs/source/en/perf_train_cpu.md @@ -0,0 +1,67 @@ + + +# Efficient Training on CPU + +This guide focuses on training large models efficiently on CPU. + +## Mixed precision with IPEX + +IPEX is optimized for CPUs with AVX-512 or above, and functionally works for CPUs with only AVX2. So, it is expected to bring performance benefit for Intel CPU generations with AVX-512 or above while CPUs with only AVX2 (e.g., AMD CPUs or older Intel CPUs) might result in a better performance under IPEX, but not guaranteed. IPEX provides performance optimizations for CPU training with both Float32 and BFloat16. The usage of BFloat16 is the main focus of the following sections. + +Low precision data type BFloat16 has been natively supported on the 3rd Generation Xeon® Scalable Processors (aka Cooper Lake) with AVX512 instruction set and will be supported on the next generation of Intel® Xeon® Scalable Processors with Intel® Advanced Matrix Extensions (Intel® AMX) instruction set with further boosted performance. The Auto Mixed Precision for CPU backend has been enabled since PyTorch-1.10. At the same time, the support of Auto Mixed Precision with BFloat16 for CPU and BFloat16 optimization of operators has been massively enabled in Intel® Extension for PyTorch, and partially upstreamed to PyTorch master branch. Users can get better performance and user experience with IPEX Auto Mixed Precision. + +Check more detailed information for [Auto Mixed Precision](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/amp.html). + +### IPEX installation: + +IPEX release is following PyTorch, to install via pip: + +| PyTorch Version | IPEX version | +| :---------------: | :----------: | +| 1.13 | 1.13.0+cpu | +| 1.12 | 1.12.300+cpu | +| 1.11 | 1.11.200+cpu | +| 1.10 | 1.10.100+cpu | + +``` +pip install intel_extension_for_pytorch== -f https://developer.intel.com/ipex-whl-stable-cpu +``` + +Check more approaches for [IPEX installation](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/installation.html). + +### Usage in Trainer +To enable auto mixed precision with IPEX in Trainer, users should add `use_ipex`, `bf16` and `no_cuda` in training command arguments. + +Take an example of the use cases on [Transformers question-answering](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) + +- Training with IPEX using BF16 auto mixed precision on CPU: +
 python run_qa.py \
+--model_name_or_path bert-base-uncased \
+--dataset_name squad \
+--do_train \
+--do_eval \
+--per_device_train_batch_size 12 \
+--learning_rate 3e-5 \
+--num_train_epochs 2 \
+--max_seq_length 384 \
+--doc_stride 128 \
+--output_dir /tmp/debug_squad/ \
+--use_ipex \
+--bf16 --no_cuda
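
If you configure the `Trainer` from Python rather than through the CLI, the same switches map onto `TrainingArguments`. A minimal sketch, assuming the rest of the training setup (model, datasets, etc.) is defined elsewhere:

```python
from transformers import TrainingArguments

# Programmatic equivalent of the --use_ipex / --bf16 / --no_cuda flags above
training_args = TrainingArguments(
    output_dir="/tmp/debug_squad/",
    use_ipex=True,  # enable Intel Extension for PyTorch optimizations
    bf16=True,      # BF16 auto mixed precision
    no_cuda=True,   # train on CPU
)
```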
+ +### Practice example + +Blog: [Accelerating PyTorch Transformers with Intel Sapphire Rapids](https://huggingface.co/blog/intel-sapphire-rapids) diff --git a/docs/source/en/perf_train_cpu.mdx b/docs/source/en/perf_train_cpu.mdx deleted file mode 100644 index c35a62fcdcef..000000000000 --- a/docs/source/en/perf_train_cpu.mdx +++ /dev/null @@ -1,60 +0,0 @@ - - -# Efficient Training on CPU - -This guide focuses on training large models efficiently on CPU. - -## Mixed precision with IPEX - -IPEX is optimized for CPUs with AVX-512 or above, and functionally works for CPUs with only AVX2. So, it is expected to bring performance benefit for Intel CPU generations with AVX-512 or above while CPUs with only AVX2 (e.g., AMD CPUs or older Intel CPUs) might result in a better performance under IPEX, but not guaranteed. IPEX provides performance optimizations for CPU training with both Float32 and BFloat16. The usage of BFloat16 is the main focus of the following sections. - -Low precision data type BFloat16 has been natively supported on the 3rd Generation Xeon® Scalable Processors (aka Cooper Lake) with AVX512 instruction set and will be supported on the next generation of Intel® Xeon® Scalable Processors with Intel® Advanced Matrix Extensions (Intel® AMX) instruction set with further boosted performance. The Auto Mixed Precision for CPU backend has been enabled since PyTorch-1.10. At the same time, the support of Auto Mixed Precision with BFloat16 for CPU and BFloat16 optimization of operators has been massively enabled in Intel® Extension for PyTorch, and partially upstreamed to PyTorch master branch. Users can get better performance and user experience with IPEX Auto Mixed Precision. - -Check more detailed information for [Auto Mixed Precision](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/amp.html). - -### IPEX installation: - -IPEX release is following PyTorch, to install via pip: - -| PyTorch Version | IPEX version | -| :---------------: | :----------: | -| 1.13 | 1.13.0+cpu | -| 1.12 | 1.12.300+cpu | -| 1.11 | 1.11.200+cpu | -| 1.10 | 1.10.100+cpu | - -``` -pip install intel_extension_for_pytorch== -f https://developer.intel.com/ipex-whl-stable-cpu -``` - -Check more approaches for [IPEX installation](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/installation.html). - -### Usage in Trainer -To enable auto mixed precision with IPEX in Trainer, users should add `use_ipex`, `bf16` and `no_cuda` in training command arguments. - -Take an example of the use cases on [Transformers question-answering](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) - -- Training with IPEX using BF16 auto mixed precision on CPU: -
 python run_qa.py \
---model_name_or_path bert-base-uncased \
---dataset_name squad \
---do_train \
---do_eval \
---per_device_train_batch_size 12 \
---learning_rate 3e-5 \
---num_train_epochs 2 \
---max_seq_length 384 \
---doc_stride 128 \
---output_dir /tmp/debug_squad/ \
---use_ipex \
---bf16 --no_cuda
- diff --git a/docs/source/en/perf_train_cpu_many.md b/docs/source/en/perf_train_cpu_many.md new file mode 100644 index 000000000000..4c131430babd --- /dev/null +++ b/docs/source/en/perf_train_cpu_many.md @@ -0,0 +1,134 @@ + + +# Efficient Training on Multiple CPUs + +When training on a single CPU is too slow, we can use multiple CPUs. This guide focuses on PyTorch-based DDP enabling distributed CPU training efficiently. + +## Intel® oneCCL Bindings for PyTorch + +[Intel® oneCCL](https://github.com/oneapi-src/oneCCL) (collective communications library) is a library for efficient distributed deep learning training implementing such collectives like allreduce, allgather, alltoall. For more information on oneCCL, please refer to the [oneCCL documentation](https://spec.oneapi.com/versions/latest/elements/oneCCL/source/index.html) and [oneCCL specification](https://spec.oneapi.com/versions/latest/elements/oneCCL/source/index.html). + +Module `oneccl_bindings_for_pytorch` (`torch_ccl` before version 1.12) implements PyTorch C10D ProcessGroup API and can be dynamically loaded as external ProcessGroup and only works on Linux platform now + +Check more detailed information for [oneccl_bind_pt](https://github.com/intel/torch-ccl). + +### Intel® oneCCL Bindings for PyTorch installation: + +Wheel files are available for the following Python versions: + +| Extension Version | Python 3.6 | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10 | +| :---------------: | :--------: | :--------: | :--------: | :--------: | :---------: | +| 1.13.0 | | √ | √ | √ | √ | +| 1.12.100 | | √ | √ | √ | √ | +| 1.12.0 | | √ | √ | √ | √ | +| 1.11.0 | | √ | √ | √ | √ | +| 1.10.0 | √ | √ | √ | √ | | + +``` +pip install oneccl_bind_pt=={pytorch_version} -f https://developer.intel.com/ipex-whl-stable-cpu +``` +where `{pytorch_version}` should be your PyTorch version, for instance 1.13.0. +Check more approaches for [oneccl_bind_pt installation](https://github.com/intel/torch-ccl). +Versions of oneCCL and PyTorch must match. + + + +oneccl_bindings_for_pytorch 1.12.0 prebuilt wheel does not work with PyTorch 1.12.1 (it is for PyTorch 1.12.0) +PyTorch 1.12.1 should work with oneccl_bindings_for_pytorch 1.12.100 + + + +## Intel® MPI library +Use this standards-based MPI implementation to deliver flexible, efficient, scalable cluster messaging on Intel® architecture. This component is part of the Intel® oneAPI HPC Toolkit. + +oneccl_bindings_for_pytorch is installed along with the MPI tool set. Need to source the environment before using it. + +for Intel® oneCCL >= 1.12.0 +``` +oneccl_bindings_for_pytorch_path=$(python -c "from oneccl_bindings_for_pytorch import cwd; print(cwd)") +source $oneccl_bindings_for_pytorch_path/env/setvars.sh +``` + +for Intel® oneCCL whose version < 1.12.0 +``` +torch_ccl_path=$(python -c "import torch; import torch_ccl; import os; print(os.path.abspath(os.path.dirname(torch_ccl.__file__)))") +source $torch_ccl_path/env/setvars.sh +``` + +#### IPEX installation: + +IPEX provides performance optimizations for CPU training with both Float32 and BFloat16, you could refer [single CPU section](./perf_train_cpu). + + +The following "Usage in Trainer" takes mpirun in Intel® MPI library as an example. + + +## Usage in Trainer +To enable multi CPU distributed training in the Trainer with the ccl backend, users should add **`--ddp_backend ccl`** in the command arguments. 
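
Under the hood, `--ddp_backend ccl` makes the `Trainer` initialize its `torch.distributed` process group with the `ccl` backend provided by `oneccl_bindings_for_pytorch`. Roughly, this amounts to the following sketch (the process group environment variables such as `RANK`, `WORLD_SIZE`, `MASTER_ADDR` and `MASTER_PORT` are normally provided by the launcher, e.g. `mpirun`, or exported as in the commands below):

```python
import torch.distributed as dist

# Importing the bindings registers the "ccl" backend with torch.distributed
import oneccl_bindings_for_pytorch  # noqa: F401

dist.init_process_group(backend="ccl")
```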
+ +Let's see an example with the [question-answering example](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) + + +The following command enables training with 2 processes on one Xeon node, with one process running per one socket. The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance. +```shell script + export CCL_WORKER_COUNT=1 + export MASTER_ADDR=127.0.0.1 + mpirun -n 2 -genv OMP_NUM_THREADS=23 \ + python3 run_qa.py \ + --model_name_or_path bert-large-uncased \ + --dataset_name squad \ + --do_train \ + --do_eval \ + --per_device_train_batch_size 12 \ + --learning_rate 3e-5 \ + --num_train_epochs 2 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir /tmp/debug_squad/ \ + --no_cuda \ + --ddp_backend ccl \ + --use_ipex +``` +The following command enables training with a total of four processes on two Xeons (node0 and node1, taking node0 as the main process), ppn (processes per node) is set to 2, with one process running per one socket. The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance. + +In node0, you need to create a configuration file which contains the IP addresses of each node (for example hostfile) and pass that configuration file path as an argument. +```shell script + cat hostfile + xxx.xxx.xxx.xxx #node0 ip + xxx.xxx.xxx.xxx #node1 ip +``` +Now, run the following command in node0 and **4DDP** will be enabled in node0 and node1 with BF16 auto mixed precision: +```shell script + export CCL_WORKER_COUNT=1 + export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip + mpirun -f hostfile -n 4 -ppn 2 \ + -genv OMP_NUM_THREADS=23 \ + python3 run_qa.py \ + --model_name_or_path bert-large-uncased \ + --dataset_name squad \ + --do_train \ + --do_eval \ + --per_device_train_batch_size 12 \ + --learning_rate 3e-5 \ + --num_train_epochs 2 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir /tmp/debug_squad/ \ + --no_cuda \ + --ddp_backend ccl \ + --use_ipex \ + --bf16 +``` diff --git a/docs/source/en/perf_train_cpu_many.mdx b/docs/source/en/perf_train_cpu_many.mdx deleted file mode 100644 index 1310e40d30e1..000000000000 --- a/docs/source/en/perf_train_cpu_many.mdx +++ /dev/null @@ -1,130 +0,0 @@ - - -# Efficient Training on Multiple CPUs - -When training on a single CPU is too slow, we can use multiple CPUs. This guide focuses on PyTorch-based DDP enabling distributed CPU training efficiently. - -## Intel® oneCCL Bindings for PyTorch - -[Intel® oneCCL](https://github.com/oneapi-src/oneCCL) (collective communications library) is a library for efficient distributed deep learning training implementing such collectives like allreduce, allgather, alltoall. For more information on oneCCL, please refer to the [oneCCL documentation](https://spec.oneapi.com/versions/latest/elements/oneCCL/source/index.html) and [oneCCL specification](https://spec.oneapi.com/versions/latest/elements/oneCCL/source/index.html). - -Module `oneccl_bindings_for_pytorch` (`torch_ccl` before version 1.12) implements PyTorch C10D ProcessGroup API and can be dynamically loaded as external ProcessGroup and only works on Linux platform now - -Check more detailed information for [oneccl_bind_pt](https://github.com/intel/torch-ccl). 
- -### Intel® oneCCL Bindings for PyTorch installation: - -Wheel files are available for the following Python versions: - -| Extension Version | Python 3.6 | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10 | -| :---------------: | :--------: | :--------: | :--------: | :--------: | :---------: | -| 1.13.0 | | √ | √ | √ | √ | -| 1.12.100 | | √ | √ | √ | √ | -| 1.12.0 | | √ | √ | √ | √ | -| 1.11.0 | | √ | √ | √ | √ | -| 1.10.0 | √ | √ | √ | √ | | - -``` -pip install oneccl_bind_pt=={pytorch_version} -f https://developer.intel.com/ipex-whl-stable-cpu -``` -where `{pytorch_version}` should be your PyTorch version, for instance 1.13.0. -Check more approaches for [oneccl_bind_pt installation](https://github.com/intel/torch-ccl). -Versions of oneCCL and PyTorch must match. - - - -oneccl_bindings_for_pytorch 1.12.0 prebuilt wheel does not work with PyTorch 1.12.1 (it is for PyTorch 1.12.0) -PyTorch 1.12.1 should work with oneccl_bindings_for_pytorch 1.12.100 - - - -## Intel® MPI library -Use this standards-based MPI implementation to deliver flexible, efficient, scalable cluster messaging on Intel® architecture. This component is part of the Intel® oneAPI HPC Toolkit. - -oneccl_bindings_for_pytorch is installed along with the MPI tool set. Need to source the environment before using it. - -for Intel® oneCCL >= 1.12.0 -``` -oneccl_bindings_for_pytorch_path=$(python -c "from oneccl_bindings_for_pytorch import cwd; print(cwd)") -source $oneccl_bindings_for_pytorch_path/env/setvars.sh -``` - -for Intel® oneCCL whose version < 1.12.0 -``` -torch_ccl_path=$(python -c "import torch; import torch_ccl; import os; print(os.path.abspath(os.path.dirname(torch_ccl.__file__)))") -source $torch_ccl_path/env/setvars.sh -``` - -#### IPEX installation: - -IPEX provides performance optimizations for CPU training with both Float32 and BFloat16, you could refer [single CPU section](./perf_train_cpu). - - -The following "Usage in Trainer" takes mpirun in Intel® MPI library as an example. - - -## Usage in Trainer -To enable multi CPU distributed training in the Trainer with the ccl backend, users should add **`--xpu_backend ccl`** in the command arguments. - -Let's see an example with the [question-answering example](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) - - -The following command enables training with 2 processes on one Xeon node, with one process running per one socket. The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance. -```shell script - export CCL_WORKER_COUNT=1 - export MASTER_ADDR=127.0.0.1 - mpirun -n 2 -genv OMP_NUM_THREADS=23 \ - python3 run_qa.py \ - --model_name_or_path bert-large-uncased \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 12 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/debug_squad/ \ - --no_cuda \ - --xpu_backend ccl \ - --use_ipex -``` -The following command enables training with a total of four processes on two Xeons (node0 and node1, taking node0 as the main process), ppn (processes per node) is set to 2, with one process running per one socket. The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance. - -In node0, you need to create a configuration file which contains the IP addresses of each node (for example hostfile) and pass that configuration file path as an argument. 
-```shell script - cat hostfile - xxx.xxx.xxx.xxx #node0 ip - xxx.xxx.xxx.xxx #node1 ip -``` -Now, run the following command in node0 and **4DDP** will be enabled in node0 and node1 with BF16 auto mixed precision: -```shell script - export CCL_WORKER_COUNT=1 - export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip - mpirun -f hostfile -n 4 -ppn 2 \ - -genv OMP_NUM_THREADS=23 \ - python3 run_qa.py \ - --model_name_or_path bert-large-uncased \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 12 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/debug_squad/ \ - --no_cuda \ - --xpu_backend ccl \ - --use_ipex \ - --bf16 -``` diff --git a/docs/source/en/perf_train_gpu_many.md b/docs/source/en/perf_train_gpu_many.md new file mode 100644 index 000000000000..fc93f763d815 --- /dev/null +++ b/docs/source/en/perf_train_gpu_many.md @@ -0,0 +1,530 @@ + + +# Efficient Training on Multiple GPUs + +When training on a single GPU is too slow or the model weights don't fit in a single GPUs memory we use a multi-GPU setup. Switching from a single GPU to multiple requires some form of parallelism as the work needs to be distributed. There are several techniques to achieve parallism such as data, tensor, or pipeline parallism. However, there is no one solution to fit them all and which settings works best depends on the hardware you are running on. While the main concepts most likely will apply to any other framework, this article is focused on PyTorch-based implementations. + + + + Note: Most of the strategies introduced in the [single GPU section](perf_train_gpu_one) (such as mixed precision training or gradient accumulation) are generic and apply to training models in general so make sure to have a look at it before diving into the following sections such as multi-GPU or CPU training. + + + +We will first discuss in depth various 1D parallelism techniques and their pros and cons and then look at how they can be combined into 2D and 3D parallelism to enable an even faster training and to support even bigger models. Various other powerful alternative approaches will be presented. + +## Concepts + +The following is the brief description of the main concepts that will be described later in depth in this document. + +1. **DataParallel (DP)** - the same setup is replicated multiple times, and each being fed a slice of the data. The processing is done in parallel and all setups are synchronized at the end of each training step. +2. **TensorParallel (TP)** - each tensor is split up into multiple chunks, so instead of having the whole tensor reside on a single gpu, each shard of the tensor resides on its designated gpu. During processing each shard gets processed separately and in parallel on different GPUs and the results are synced at the end of the step. This is what one may call horizontal parallelism, as the splitting happens on horizontal level. +3. **PipelineParallel (PP)** - the model is split up vertically (layer-level) across multiple GPUs, so that only one or several layers of the model are places on a single gpu. Each gpu processes in parallel different stages of the pipeline and working on a small chunk of the batch. +4. **Zero Redundancy Optimizer (ZeRO)** - Also performs sharding of the tensors somewhat similar to TP, except the whole tensor gets reconstructed in time for a forward or backward computation, therefore the model doesn't need to be modified. 
It also supports various offloading techniques to compensate for limited GPU memory. +5. **Sharded DDP** - is another name for the foundational ZeRO concept as used by various other implementations of ZeRO. + +Before diving deeper into the specifics of each concept we first have a look at the rough decision process when training large models on a large infrastructure. + +## Scalability Strategy + +**⇨ Single Node / Multi-GPU** +* Model fits onto a single GPU: + + 1. DDP - Distributed DP + 2. ZeRO - may or may not be faster depending on the situation and configuration used + +* Model doesn't fit onto a single GPU: + + 1. PP + 2. ZeRO + 3. TP + + With very fast intra-node connectivity of NVLINK or NVSwitch all three should be mostly on par, without these PP will be faster than TP or ZeRO. The degree of TP may also make a difference. Best to experiment to find the winner on your particular setup. + + TP is almost always used within a single node. That is TP size <= gpus per node. + +* Largest Layer not fitting into a single GPU: + + 1. If not using ZeRO - must use TP, as PP alone won't be able to fit. + 2. With ZeRO see the same entry for "Single GPU" above + + +**⇨ Multi-Node / Multi-GPU** + +* When you have fast inter-node connectivity: + + 1. ZeRO - as it requires close to no modifications to the model + 2. PP+TP+DP - less communications, but requires massive changes to the model + +* when you have slow inter-node connectivity and still low on GPU memory: + + 1. DP+PP+TP+ZeRO-1 + + + +## Data Parallelism + +Most users with just 2 GPUs already enjoy the increased training speed up thanks to `DataParallel` (DP) and `DistributedDataParallel` (DDP) that are almost trivial to use. This is a built-in feature of Pytorch. Note that in general it is advised to use DDP as it is better maintained and works for all models while DP might fail for some models. [PyTorch documentation](https://pytorch.org/docs/master/generated/torch.nn.DataParallel.html) itself recommends the use of DDP. + +### DP vs DDP + +`DistributedDataParallel` (DDP) is typically faster than `DataParallel` (DP), but it is not always the case: +* while DP is python threads-based, DDP is multiprocess-based - and as such it has no python threads limitations, such as GIL +* on the other hand a slow inter-connectivity between the GPU cards could lead to an actual slower outcome with DDP + +Here are the main differences in the inter-GPU communication overhead between the two modes: + +[DDP](https://pytorch.org/docs/master/notes/ddp.html): + +- At the start time the main process replicates the model once from gpu 0 to the rest of gpus +- Then for each batch: + 1. each gpu consumes each own mini-batch of data directly + 2. during `backward`, once the local gradients are ready, they are then averaged across all processes + +[DP](https://pytorch.org/docs/master/generated/torch.nn.DataParallel.html): + +For each batch: + 1. gpu 0 reads the batch of data and then sends a mini-batch to each gpu + 2. replicates the up-to-date model from gpu 0 to each gpu + 3. runs `forward` and sends output from each gpu to gpu 0, computes loss + 4. scatters loss from gpu 0 to all gpus, runs `backward` + 5. sends gradients from each gpu to gpu 0 and averages those + +The only communication DDP performs per batch is sending gradients, whereas DP does 5 different data exchanges per batch. + +DP copies data within the process via python threads, whereas DDP copies data via [torch.distributed](https://pytorch.org/docs/master/distributed.html). 
+ +Under DP gpu 0 performs a lot more work than the rest of the gpus, thus resulting in under-utilization of gpus. + +You can use DDP across multiple machines, but this is not the case with DP. + +There are other differences between DP and DDP but they aren't relevant to this discussion. + +If you want to go really deep into understanding these 2 modes, this [article](https://www.telesens.co/2019/04/04/distributed-data-parallel-training-using-pytorch-on-aws/) is highly recommended, as it has great diagrams, includes multiple benchmarks and profiler outputs on various hardware, explains all the nuances that you may need to know. + +Let's look at an actual benchmark: + +| Type | NVlink | Time | +| :----- | ----- | ---: | +| 2:DP | Y | 110s | +| 2:DDP | Y | 101s | +| 2:DDP | N | 131s | + + +Analysis: + +Here DP is ~10% slower than DDP w/ NVlink, but ~15% faster than DDP w/o NVlink + +The real difference will depend on how much data each GPU needs to sync with the others - the more there is to sync, the more a slow link will slow down the total runtime. + +Here is the full benchmark code and outputs: + +`NCCL_P2P_DISABLE=1` was used to disable the NVLink feature on the corresponding benchmark. + +``` + +# DP +rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \ +python examples/pytorch/language-modeling/run_clm.py \ +--model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \ +--do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 + +{'train_runtime': 110.5948, 'train_samples_per_second': 1.808, 'epoch': 0.69} + +# DDP w/ NVlink +rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \ +python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \ +--model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \ +--do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 + +{'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69} + +# DDP w/o NVlink +rm -r /tmp/test-clm; NCCL_P2P_DISABLE=1 CUDA_VISIBLE_DEVICES=0,1 \ +python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \ +--model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \ +--do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 + +{'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69} +``` + +Hardware: 2x TITAN RTX 24GB each + NVlink with 2 NVLinks (`NV2` in `nvidia-smi topo -m`) +Software: `pytorch-1.8-to-be` + `cuda-11.0` / `transformers==4.3.0.dev0` + +## ZeRO Data Parallelism + +ZeRO-powered data parallelism (ZeRO-DP) is described on the following diagram from this [blog post](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) +![DeepSpeed-Image-1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-zero.png) + +It can be difficult to wrap one's head around it, but in reality the concept is quite simple. This is just the usual `DataParallel` (DP), except, instead of replicating the full model params, gradients and optimizer states, each GPU stores only a slice of it. And then at run-time when the full layer params are needed just for the given layer, all GPUs synchronize to give each other parts that they miss - this is it. 
+ +Consider this simple model with 3 layers, where each layer has 3 params: +``` +La | Lb | Lc +---|----|--- +a0 | b0 | c0 +a1 | b1 | c1 +a2 | b2 | c2 +``` +Layer La has weights a0, a1 and a2. + +If we have 3 GPUs, the Sharded DDP (= Zero-DP) splits the model onto 3 GPUs like so: + +``` +GPU0: +La | Lb | Lc +---|----|--- +a0 | b0 | c0 + +GPU1: +La | Lb | Lc +---|----|--- +a1 | b1 | c1 + +GPU2: +La | Lb | Lc +---|----|--- +a2 | b2 | c2 +``` + +In a way this is the same horizontal slicing, as tensor parallelism, if you imagine the typical DNN diagram. Vertical slicing is where one puts whole layer-groups on different GPUs. But it's just the starting point. + +Now each of these GPUs will get the usual mini-batch as it works in DP: +``` +x0 => GPU0 +x1 => GPU1 +x2 => GPU2 +``` + +The inputs are unmodified - they think they are going to be processed by the normal model. + +First, the inputs hit the layer La. + +Let's focus just on GPU0: x0 needs a0, a1, a2 params to do its forward path, but GPU0 has only a0 - it gets sent a1 from GPU1 and a2 from GPU2, bringing all pieces of the model together. + +In parallel, GPU1 gets mini-batch x1 and it only has a1, but needs a0 and a2 params, so it gets those from GPU0 and GPU2. + +Same happens to GPU2 that gets input x2. It gets a0 and a1 from GPU0 and GPU1, and with its a2 it reconstructs the full tensor. + +All 3 GPUs get the full tensors reconstructed and a forward happens. + +As soon as the calculation is done, the data that is no longer needed gets dropped - it's only used during the calculation. The reconstruction is done efficiently via a pre-fetch. + +And the whole process is repeated for layer Lb, then Lc forward-wise, and then backward Lc -> Lb -> La. + +To me this sounds like an efficient group backpacking weight distribution strategy: + +1. person A carries the tent +2. person B carries the stove +3. person C carries the axe + +Now each night they all share what they have with others and get from others what they don't have, and in the morning they pack up their allocated type of gear and continue on their way. This is Sharded DDP / Zero DP. + +Compare this strategy to the simple one where each person has to carry their own tent, stove and axe, which would be far more inefficient. This is DataParallel (DP and DDP) in Pytorch. + +While reading the literature on this topic you may encounter the following synonyms: Sharded, Partitioned. + +If you pay close attention the way ZeRO partitions the model's weights - it looks very similar to tensor parallelism which will be discussed later. This is because it partitions/shards each layer's weights, unlike vertical model parallelism which is discussed next. + +Implementations: + +- [DeepSpeed](https://www.deepspeed.ai/features/#the-zero-redundancy-optimizer) ZeRO-DP stages 1+2+3 +- [`transformers` integration](main_classes/trainer#trainer-integrations) + +## Naive Model Parallelism (Vertical) and Pipeline Parallelism + +Naive Model Parallelism (MP) is where one spreads groups of model layers across multiple GPUs. The mechanism is relatively simple - switch the desired layers `.to()` the desired devices and now whenever the data goes in and out those layers switch the data to the same device as the layer and leave the rest unmodified. + +We refer to it as Vertical MP, because if you remember how most models are drawn, we slice the layers vertically. 
For example, if the following diagram shows an 8-layer model: + +``` +=================== =================== +| 0 | 1 | 2 | 3 | | 4 | 5 | 6 | 7 | +=================== =================== + gpu0 gpu1 +``` +we just sliced it in 2 vertically, placing layers 0-3 onto GPU0 and 4-7 to GPU1. + +Now while data travels from layer 0 to 1, 1 to 2 and 2 to 3 this is just the normal model. But when data needs to pass from layer 3 to layer 4 it needs to travel from GPU0 to GPU1 which introduces a communication overhead. If the participating GPUs are on the same compute node (e.g. same physical machine) this copying is pretty fast, but if the GPUs are located on different compute nodes (e.g. multiple machines) the communication overhead could be significantly larger. + +Then layers 4 to 5 to 6 to 7 are as a normal model would have and when the 7th layer completes we often need to send the data back to layer 0 where the labels are (or alternatively send the labels to the last layer). Now the loss can be computed and the optimizer can do its work. + +Problems: +- the main deficiency and why this one is called "naive" MP, is that all but one GPU is idle at any given moment. So if 4 GPUs are used, it's almost identical to quadrupling the amount of memory of a single GPU, and ignoring the rest of the hardware. Plus there is the overhead of copying the data between devices. So 4x 6GB cards will be able to accommodate the same size as 1x 24GB card using naive MP, except the latter will complete the training faster, since it doesn't have the data copying overhead. But, say, if you have 40GB cards and need to fit a 45GB model you can with 4x 40GB cards (but barely because of the gradient and optimizer states) +- shared embeddings may need to get copied back and forth between GPUs. + +Pipeline Parallelism (PP) is almost identical to a naive MP, but it solves the GPU idling problem, by chunking the incoming batch into micro-batches and artificially creating a pipeline, which allows different GPUs to concurrently participate in the computation process. + +The following illustration from the [GPipe paper](https://ai.googleblog.com/2019/03/introducing-gpipe-open-source-library.html) shows the naive MP on the top, and PP on the bottom: + +![mp-pp](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-gpipe-bubble.png) + +It's easy to see from the bottom diagram how PP has less dead zones, where GPUs are idle. The idle parts are referred to as the "bubble". + +Both parts of the diagram show a parallelism that is of degree 4. That is 4 GPUs are participating in the pipeline. So there is the forward path of 4 pipe stages F0, F1, F2 and F3 and then the return reverse order backward path of B3, B2, B1 and B0. + +PP introduces a new hyper-parameter to tune and it's `chunks` which defines how many chunks of data are sent in a sequence through the same pipe stage. For example, in the bottom diagram you can see that `chunks=4`. GPU0 performs the same forward path on chunk 0, 1, 2 and 3 (F0,0, F0,1, F0,2, F0,3) and then it waits for other GPUs to do their work and only when their work is starting to be complete, GPU0 starts to work again doing the backward path for chunks 3, 2, 1 and 0 (B0,3, B0,2, B0,1, B0,0). + +Note that conceptually this is the same concept as gradient accumulation steps (GAS). Pytorch uses `chunks`, whereas DeepSpeed refers to the same hyper-parameter as GAS. + +Because of the chunks, PP introduces the concept of micro-batches (MBS). 
DP splits the global data batch size into mini-batches, so if you have a DP degree of 4, a global batch size of 1024 gets split up into 4 mini-batches of 256 each (1024/4). And if the number of `chunks` (or GAS) is 32 we end up with a micro-batch size of 8 (256/32). Each Pipeline stage works with a single micro-batch at a time.
+
+To calculate the global batch size of the DP + PP setup we then do: `mbs*chunks*dp_degree` (`8*32*4=1024`).
+
+Let's go back to the diagram.
+
+With `chunks=1` you end up with the naive MP, which is very inefficient. With a very large `chunks` value you end up with tiny micro-batch sizes, which may not be very efficient either. So one has to experiment to find the value that leads to the most efficient utilization of the GPUs.
+
+While the diagram shows that there is a bubble of "dead" time that can't be parallelized because the last `forward` stage has to wait for `backward` to complete the pipeline, the purpose of finding the best value for `chunks` is to enable a high concurrent GPU utilization across all participating GPUs, which translates to minimizing the size of the bubble.
+
+There are 2 groups of solutions - the traditional Pipeline API and the more modern solutions that make things much easier for the end user.
+
+Traditional Pipeline API solutions:
+- PyTorch
+- DeepSpeed
+- Megatron-LM
+
+Modern solutions:
+- Varuna
+- SageMaker
+
+Problems with traditional Pipeline API solutions:
+- have to modify the model quite heavily, because Pipeline requires one to rewrite the normal flow of modules into an `nn.Sequential` sequence of the same, which may require changes to the design of the model.
+- currently the Pipeline API is very restricted. If you have a bunch of python variables being passed in the very first stage of the Pipeline, you will have to find a way around it. Currently, the pipeline interface requires either a single Tensor or a tuple of Tensors as the only input and output. These tensors must have a batch size as the very first dimension, since the pipeline is going to chunk the mini-batch into micro-batches. Possible improvements are being discussed here https://github.com/pytorch/pytorch/pull/50693
+- conditional control flow at the level of pipe stages is not possible - e.g., Encoder-Decoder models like T5 require special workarounds to handle a conditional encoder stage.
+- have to arrange each layer so that the output of one model becomes an input to the other model.
+
+We have yet to experiment with Varuna and SageMaker, but their papers report that they have overcome the list of problems mentioned above and that they require much smaller changes to the user's model.
+
+Implementations:
+- [Pytorch](https://pytorch.org/docs/stable/pipeline.html) (initial support in pytorch-1.8, and progressively getting improved in 1.9 and more so in 1.10). Some [examples](https://github.com/pytorch/pytorch/blob/master/benchmarks/distributed/pipeline/pipe.py)
+- [DeepSpeed](https://www.deepspeed.ai/tutorials/pipeline/)
+- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) has an internal implementation - no API.
+- [Varuna](https://github.com/microsoft/varuna)
+- [SageMaker](https://arxiv.org/abs/2111.05972) - this is a proprietary solution that can only be used on AWS.
+- [OSLO](https://github.com/tunib-ai/oslo) - this is implemented based on Hugging Face Transformers.
+
+🤗 Transformers status: as of this writing none of the models supports full-PP. GPT2 and T5 models have naive MP support.
The main obstacle is being unable to convert the models to `nn.Sequential` and have all the inputs be Tensors. This is because currently the models include many features that make the conversion very complicated, and these will need to be removed to accomplish that.
+
+Other approaches:
+
+DeepSpeed, Varuna and SageMaker use the concept of an [Interleaved Pipeline](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-core-features.html).
+![interleaved-pipeline-execution](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-sagemaker-interleaved-pipeline.png)
+
+Here the bubble (idle time) is further minimized by prioritizing backward passes.
+
+Varuna further tries to improve the schedule by using simulations to discover the most efficient scheduling.
+
+OSLO has a pipeline parallelism implementation based on Transformers, without `nn.Sequential` conversion.
+
+## Tensor Parallelism
+
+In Tensor Parallelism each GPU processes only a slice of a tensor and only aggregates the full tensor for operations that require the whole thing.
+
+In this section we use concepts and diagrams from the [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) paper: [Efficient Large-Scale Language Model Training on GPU Clusters](https://arxiv.org/abs/2104.04473).
+
+The main building block of any transformer is a fully connected `nn.Linear` followed by a nonlinear activation `GeLU`.
+
+Following the Megatron paper's notation, we can write the dot-product part of it as `Y = GeLU(XA)`, where `X` and `Y` are the input and output vectors, and `A` is the weight matrix.
+
+If we look at the computation in matrix form, it's easy to see how the matrix multiplication can be split between multiple GPUs:
+![Parallel GEMM](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-parallel_gemm.png)
+
+If we split the weight matrix `A` column-wise across `N` GPUs and perform matrix multiplications `XA_1` through `XA_n` in parallel, then we will end up with `N` output vectors `Y_1, Y_2, ..., Y_n` which can be fed into `GeLU` independently:
+![independent GeLU](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-independent-gelu.png)
+
+Using this principle, we can update an MLP of arbitrary depth, without the need for any synchronization between GPUs until the very end, where we need to reconstruct the output vector from shards. The Megatron-LM paper authors provide a helpful illustration for that:
+![parallel shard processing](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-parallel_shard_processing.png)
+
+Parallelizing the multi-headed attention layers is even simpler, since they are already inherently parallel, due to having multiple independent heads!
+![parallel self-attention](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-parallel_self_attention.png)
+
+Special considerations: TP requires a very fast network, and therefore it's not advisable to do TP across more than one node. Practically, if a node has 4 GPUs, the highest TP degree is therefore 4. If you need a TP degree of 8, you need to use nodes that have at least 8 GPUs.
+
+This section is based on the original, much more [detailed TP overview](https://github.com/huggingface/transformers/issues/10321#issuecomment-783543530)
+by [@anton-l](https://github.com/anton-l).
+
+SageMaker combines TP with DP for more efficient processing.
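+
+To make the column-splitting idea concrete, here is a small numerical sketch (our own toy example, not Megatron-LM code); the shapes and the 2-way split are arbitrary choices for illustration:
+
+```python
+import torch
+
+torch.manual_seed(0)
+X = torch.randn(8, 16)   # a batch of 8 inputs with hidden size 16
+A = torch.randn(16, 32)  # the weight matrix of the first MLP projection
+
+# Split A column-wise into 2 shards, as if each lived on its own GPU.
+A_shards = torch.chunk(A, chunks=2, dim=1)
+
+# Each "GPU" computes GeLU(X @ A_i) independently - no communication is needed yet.
+Y_shards = [torch.nn.functional.gelu(X @ A_i) for A_i in A_shards]
+
+# Concatenating the shards reconstructs the full output.
+Y = torch.cat(Y_shards, dim=1)
+print(torch.allclose(Y, torch.nn.functional.gelu(X @ A), atol=1e-6))  # True
+```
+
+In a real implementation the second linear layer of the MLP is typically split row-wise instead, and its partial outputs are summed with an all-reduce - that is the synchronization point mentioned above.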
+
+Alternative names:
+- DeepSpeed calls it [tensor slicing](https://www.deepspeed.ai/features/#model-parallelism)
+
+Implementations:
+- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) has an internal implementation, as it's very model-specific
+- [parallelformers](https://github.com/tunib-ai/parallelformers) (only inference at the moment)
+- [SageMaker](https://arxiv.org/abs/2111.05972) - this is a proprietary solution that can only be used on AWS.
+- [OSLO](https://github.com/tunib-ai/oslo) has a tensor parallelism implementation based on Transformers.
+
+🤗 Transformers status:
+- core: not yet implemented in the core
+- but if you want inference, [parallelformers](https://github.com/tunib-ai/parallelformers) provides this support for most of our models. So until this is implemented in the core you can use theirs. And hopefully training mode will be supported too.
+- Deepspeed-Inference also supports our BERT, GPT-2, and GPT-Neo models in their super-fast CUDA-kernel-based inference mode; see more [here](https://www.deepspeed.ai/tutorials/inference-tutorial/)
+
+## DP+PP
+
+The following diagram from the DeepSpeed [pipeline tutorial](https://www.deepspeed.ai/tutorials/pipeline/) demonstrates how one combines DP with PP.
+
+![dp-pp-2d](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-zero-dp-pp.png)
+
+Here it's important to see how DP rank 0 doesn't see GPU2 and DP rank 1 doesn't see GPU3. To DP there are just GPUs 0 and 1, to which it feeds data as if there were just 2 GPUs. GPU0 "secretly" offloads some of its load to GPU2 using PP. And GPU1 does the same by enlisting GPU3 to its aid.
+
+Since each dimension requires at least 2 GPUs, here you'd need at least 4 GPUs.
+
+Implementations:
+- [DeepSpeed](https://github.com/microsoft/DeepSpeed)
+- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
+- [Varuna](https://github.com/microsoft/varuna)
+- [SageMaker](https://arxiv.org/abs/2111.05972)
+- [OSLO](https://github.com/tunib-ai/oslo)
+
+🤗 Transformers status: not yet implemented
+
+## DP+PP+TP
+
+To get even more efficient training, 3D parallelism is used, where PP is combined with TP and DP. This can be seen in the following diagram.
+
+![dp-pp-tp-3d](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-deepspeed-3d.png)
+
+This diagram is from a blog post [3D parallelism: Scaling to trillion-parameter models](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/), which is a good read as well.
+
+Since each dimension requires at least 2 GPUs, here you'd need at least 8 GPUs.
+
+Implementations:
+- [DeepSpeed](https://github.com/microsoft/DeepSpeed) - DeepSpeed also includes an even more efficient DP, which they call ZeRO-DP.
+- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
+- [Varuna](https://github.com/microsoft/varuna)
+- [SageMaker](https://arxiv.org/abs/2111.05972)
+- [OSLO](https://github.com/tunib-ai/oslo)
+
+🤗 Transformers status: not yet implemented, since we have no PP and TP.
+
+## ZeRO DP+PP+TP
+
+One of the main features of DeepSpeed is ZeRO, which is a super-scalable extension of DP. It has already been discussed in [ZeRO Data Parallelism](#zero-data-parallelism). Normally it's a standalone feature that doesn't require PP or TP. But it can be combined with PP and TP.
+
+When ZeRO-DP is combined with PP (and optionally TP), it typically enables only ZeRO stage 1 (optimizer sharding).
+
+While it's theoretically possible to use ZeRO stage 2 (gradient sharding) with Pipeline Parallelism, it will have bad performance impacts. There would need to be an additional reduce-scatter collective for every micro-batch to aggregate the gradients before sharding, which adds a potentially significant communication overhead. By nature of Pipeline Parallelism, small micro-batches are used and instead the focus is on trying to balance arithmetic intensity (micro-batch size) with minimizing the Pipeline bubble (number of micro-batches). Therefore those communication costs are going to hurt.
+
+In addition, there are already fewer layers than normal due to PP, and so the memory savings won't be huge. PP already reduces gradient size by ``1/PP``, and so gradient sharding savings on top of that are less significant than in pure DP.
+
+ZeRO stage 3 is not a good choice either for the same reason - more inter-node communication is required.
+
+And since we have ZeRO, the other benefit is ZeRO-Offload. Since this is stage 1, the optimizer states can be offloaded to CPU.
+
+Implementations:
+- [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) and [Megatron-Deepspeed from BigScience](https://github.com/bigscience-workshop/Megatron-DeepSpeed), which is a fork of the former repo.
+- [OSLO](https://github.com/tunib-ai/oslo)
+
+Important papers:
+
+- [Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model](
+https://arxiv.org/abs/2201.11990)
+
+🤗 Transformers status: not yet implemented, since we have no PP and TP.
+
+## FlexFlow
+
+[FlexFlow](https://github.com/flexflow/FlexFlow) also solves the parallelization problem with a slightly different approach.
+
+Paper: ["Beyond Data and Model Parallelism for Deep Neural Networks" by Zhihao Jia, Matei Zaharia, Alex Aiken](https://arxiv.org/abs/1807.05358)
+
+It performs a sort of 4D Parallelism over Sample-Operator-Attribute-Parameter.
+
+1. Sample = Data Parallelism (sample-wise parallel)
+2. Operator = Parallelize a single operation into several sub-operations
+3. Attribute = Data Parallelism (length-wise parallel)
+4. Parameter = Model Parallelism (regardless of dimension - horizontal or vertical)
+
+Examples:
+* Sample
+
+Let's take 10 batches of sequence length 512. If we parallelize them by sample dimension into 2 devices, we get 10 x 512 which becomes 5 x 2 x 512.
+
+* Operator
+
+If we perform layer normalization, we compute std first and mean second, and then we can normalize data. Operator parallelism allows computing std and mean in parallel. So if we parallelize them by operator dimension into 2 devices (cuda:0, cuda:1), first we copy input data into both devices, and cuda:0 computes std while cuda:1 computes mean at the same time (see the short sketch further below).
+
+* Attribute
+
+We have 10 batches of length 512. If we parallelize them by attribute dimension into 2 devices, 10 x 512 will be 10 x 2 x 256.
+
+* Parameter
+
+It is similar to tensor model parallelism or naive layer-wise model parallelism.
+
+![flex-flow-soap](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-flexflow.jpeg)
+
+The significance of this framework is that it takes resources like (1) GPU/TPU/CPU vs. (2) RAM/DRAM vs. (3) fast-intra-connect/slow-inter-connect and it automatically optimizes all these, algorithmically deciding which parallelization to use where.
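+
+As a toy illustration of the Operator example above (this is just hand-written PyTorch, not FlexFlow's API; it falls back to CPU when two GPUs are not available):
+
+```python
+import torch
+
+# Two "devices" for the two sub-operations of a layer normalization.
+d0 = "cuda:0" if torch.cuda.device_count() >= 2 else "cpu"
+d1 = "cuda:1" if torch.cuda.device_count() >= 2 else "cpu"
+
+x = torch.randn(10, 512)
+x0, x1 = x.to(d0), x.to(d1)           # copy the input to both devices
+
+mean = x0.mean(dim=-1, keepdim=True)  # computed on d0
+std = x1.std(dim=-1, keepdim=True)    # computed on d1, concurrently when both are GPUs
+
+normalized = (x0 - mean) / (std.to(d0) + 1e-5)
+```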
+ +One very important aspect is that FlexFlow is designed for optimizing DNN parallelizations for models with static and fixed workloads, since models with dynamic behavior may prefer different parallelization strategies across iterations. + +So the promise is very attractive - it runs a 30min simulation on the cluster of choice and it comes up with the best strategy to utilise this specific environment. If you add/remove/replace any parts it'll run and re-optimize the plan for that. And then you can train. A different setup will have its own custom optimization. + +🤗 Transformers status: not yet integrated. We already have our models FX-trace-able via [transformers.utils.fx](https://github.com/huggingface/transformers/blob/master/src/transformers/utils/fx.py), which is a prerequisite for FlexFlow, so someone needs to figure out what needs to be done to make FlexFlow work with our models. + + +## Which Strategy To Use When + +Here is a very rough outline at which parallelism strategy to use when. The first on each list is typically faster. + +**⇨ Single GPU** + +* Model fits onto a single GPU: + + 1. Normal use + +* Model doesn't fit onto a single GPU: + + 1. ZeRO + Offload CPU and optionally NVMe + 2. as above plus Memory Centric Tiling (see below for details) if the largest layer can't fit into a single GPU + +* Largest Layer not fitting into a single GPU: + +1. ZeRO - Enable [Memory Centric Tiling](https://deepspeed.readthedocs.io/en/latest/zero3.html#memory-centric-tiling) (MCT). It allows you to run arbitrarily large layers by automatically splitting them and executing them sequentially. MCT reduces the number of parameters that are live on a GPU, but it does not affect the activation memory. As this need is very rare as of this writing a manual override of `torch.nn.Linear` needs to be done by the user. + +**⇨ Single Node / Multi-GPU** + +* Model fits onto a single GPU: + + 1. DDP - Distributed DP + 2. ZeRO - may or may not be faster depending on the situation and configuration used + +* Model doesn't fit onto a single GPU: + + 1. PP + 2. ZeRO + 3. TP + + With very fast intra-node connectivity of NVLINK or NVSwitch all three should be mostly on par, without these PP will be faster than TP or ZeRO. The degree of TP may also make a difference. Best to experiment to find the winner on your particular setup. + + TP is almost always used within a single node. That is TP size <= gpus per node. + +* Largest Layer not fitting into a single GPU: + + 1. If not using ZeRO - must use TP, as PP alone won't be able to fit. + 2. With ZeRO see the same entry for "Single GPU" above + + +**⇨ Multi-Node / Multi-GPU** + +* When you have fast inter-node connectivity: + + 1. ZeRO - as it requires close to no modifications to the model + 2. PP+TP+DP - less communications, but requires massive changes to the model + +* when you have slow inter-node connectivity and still low on GPU memory: + + 1. DP+PP+TP+ZeRO-1 diff --git a/docs/source/en/perf_train_gpu_many.mdx b/docs/source/en/perf_train_gpu_many.mdx deleted file mode 100644 index 17eb7b739925..000000000000 --- a/docs/source/en/perf_train_gpu_many.mdx +++ /dev/null @@ -1,529 +0,0 @@ - - -# Efficient Training on Multiple GPUs - -When training on a single GPU is too slow or the model weights don't fit in a single GPUs memory we use a multi-GPU setup. Switching from a single GPU to multiple requires some form of parallelism as the work needs to be distributed. There are several techniques to achieve parallism such as data, tensor, or pipeline parallism. 
However, there is no one solution to fit them all and which settings works best depends on the hardware you are running on. While the main concepts most likely will apply to any other framework, this article is focused on PyTorch-based implementations. - - - - Note: Most of the strategies introduced in the [single GPU section](perf_train_gpu_one) (such as mixed precision training or gradient accumulation) are generic and apply to training models in general so make sure to have a look at it before diving into the following sections such as multi-GPU or CPU training. - - - -We will first discuss in depth various 1D parallelism techniques and their pros and cons and then look at how they can be combined into 2D and 3D parallelism to enable an even faster training and to support even bigger models. Various other powerful alternative approaches will be presented. - -## Concepts - -The following is the brief description of the main concepts that will be described later in depth in this document. - -1. **DataParallel (DP)** - the same setup is replicated multiple times, and each being fed a slice of the data. The processing is done in parallel and all setups are synchronized at the end of each training step. -2. **TensorParallel (TP)** - each tensor is split up into multiple chunks, so instead of having the whole tensor reside on a single gpu, each shard of the tensor resides on its designated gpu. During processing each shard gets processed separately and in parallel on different GPUs and the results are synced at the end of the step. This is what one may call horizontal parallelism, as the splitting happens on horizontal level. -3. **PipelineParallel (PP)** - the model is split up vertically (layer-level) across multiple GPUs, so that only one or several layers of the model are places on a single gpu. Each gpu processes in parallel different stages of the pipeline and working on a small chunk of the batch. -4. **Zero Redundancy Optimizer (ZeRO)** - Also performs sharding of the tensors somewhat similar to TP, except the whole tensor gets reconstructed in time for a forward or backward computation, therefore the model doesn't need to be modified. It also supports various offloading techniques to compensate for limited GPU memory. -5. **Sharded DDP** - is another name for the foundational ZeRO concept as used by various other implementations of ZeRO. - -Before diving deeper into the specifics of each concept we first have a look at the rough decision process when training large models on a large infrastructure. - -## Scalability Strategy - -**⇨ Single Node / Multi-GPU** -* Model fits onto a single GPU: - - 1. DDP - Distributed DP - 2. ZeRO - may or may not be faster depending on the situation and configuration used - -* Model doesn't fit onto a single GPU: - - 1. PP - 2. ZeRO - 3. TP - - With very fast intra-node connectivity of NVLINK or NVSwitch all three should be mostly on par, without these PP will be faster than TP or ZeRO. The degree of TP may also make a difference. Best to experiment to find the winner on your particular setup. - - TP is almost always used within a single node. That is TP size <= gpus per node. - -* Largest Layer not fitting into a single GPU: - - 1. If not using ZeRO - must use TP, as PP alone won't be able to fit. - 2. With ZeRO see the same entry for "Single GPU" above - - -**⇨ Multi-Node / Multi-GPU** - -* When you have fast inter-node connectivity: - - 1. ZeRO - as it requires close to no modifications to the model - 2. 
PP+TP+DP - less communications, but requires massive changes to the model - -* when you have slow inter-node connectivity and still low on GPU memory: - - 1. DP+PP+TP+ZeRO-1 - - - -## Data Parallelism - -Most users with just 2 GPUs already enjoy the increased training speed up thanks to `DataParallel` (DP) and `DistributedDataParallel` (DDP) that are almost trivial to use. This is a built-in feature of Pytorch. Note that in general it is advised to use DDP as it is better maintained and works for all models while DP might fail for some models. [PyTorch documentation](https://pytorch.org/docs/master/generated/torch.nn.DataParallel.html) itself recommends the use of DDP. - -### DP vs DDP - -`DistributedDataParallel` (DDP) is typically faster than `DataParallel` (DP), but it is not always the case: -* while DP is python threads-based, DDP is multiprocess-based - and as such it has no python threads limitations, such as GIL -* on the other hand a slow inter-connectivity between the GPU cards could lead to an actual slower outcome with DDP - -Here are the main differences in the inter-GPU communication overhead between the two modes: - -[DDP](https://pytorch.org/docs/master/notes/ddp.html): - -- At the start time the main process replicates the model once from gpu 0 to the rest of gpus -- Then for each batch: - 1. each gpu consumes each own mini-batch of data directly - 2. during `backward`, once the local gradients are ready, they are then averaged across all processes - -[DP](https://pytorch.org/docs/master/generated/torch.nn.DataParallel.html): - -For each batch: - 1. gpu 0 reads the batch of data and then sends a mini-batch to each gpu - 2. replicates the up-to-date model from gpu 0 to each gpu - 3. runs `forward` and sends output from each gpu to gpu 0, computes loss - 4. scatters loss from gpu 0 to all gpus, runs `backward` - 5. sends gradients from each gpu to gpu 0 and averages those - -The only communication DDP performs per batch is sending gradients, whereas DP does 5 different data exchanges per batch. - -DP copies data within the process via python threads, whereas DDP copies data via [torch.distributed](https://pytorch.org/docs/master/distributed.html). - -Under DP gpu 0 performs a lot more work than the rest of the gpus, thus resulting in under-utilization of gpus. - -You can use DDP across multiple machines, but this is not the case with DP. - -There are other differences between DP and DDP but they aren't relevant to this discussion. - -If you want to go really deep into understanding these 2 modes, this [article](https://www.telesens.co/2019/04/04/distributed-data-parallel-training-using-pytorch-on-aws/) is highly recommended, as it has great diagrams, includes multiple benchmarks and profiler outputs on various hardware, explains all the nuances that you may need to know. - -Let's look at an actual benchmark: - -| Type | NVlink | Time | -| :----- | ----- | ---: | -| 2:DP | Y | 110s | -| 2:DDP | Y | 101s | -| 2:DDP | N | 131s | - - -Analysis: - -Here DP is ~10% slower than DDP w/ NVlink, but ~15% faster than DDP w/o NVlink - -The real difference will depend on how much data each GPU needs to sync with the others - the more there is to sync, the more a slow link will slow down the total runtime. - -Here is the full benchmark code and outputs: - -`NCCL_P2P_DISABLE=1` was used to disable the NVLink feature on the corresponding benchmark. 
- -``` - -# DP -rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \ -python examples/pytorch/language-modeling/run_clm.py \ ---model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \ ---do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 - -{'train_runtime': 110.5948, 'train_samples_per_second': 1.808, 'epoch': 0.69} - -# DDP w/ NVlink -rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \ -python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \ ---model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \ ---do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 - -{'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69} - -# DDP w/o NVlink -rm -r /tmp/test-clm; NCCL_P2P_DISABLE=1 CUDA_VISIBLE_DEVICES=0,1 \ -python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \ ---model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \ ---do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 - -{'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69} -``` - -Hardware: 2x TITAN RTX 24GB each + NVlink with 2 NVLinks (`NV2` in `nvidia-smi topo -m`) -Software: `pytorch-1.8-to-be` + `cuda-11.0` / `transformers==4.3.0.dev0` - -## ZeRO Data Parallelism - -ZeRO-powered data parallelism (ZeRO-DP) is described on the following diagram from this [blog post](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) -![DeepSpeed-Image-1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-zero.png) - -It can be difficult to wrap one's head around it, but in reality the concept is quite simple. This is just the usual `DataParallel` (DP), except, instead of replicating the full model params, gradients and optimizer states, each GPU stores only a slice of it. And then at run-time when the full layer params are needed just for the given layer, all GPUs synchronize to give each other parts that they miss - this is it. - -Consider this simple model with 3 layers, where each layer has 3 params: -``` -La | Lb | Lc ----|----|--- -a0 | b0 | c0 -a1 | b1 | c1 -a2 | b2 | c2 -``` -Layer La has weights a0, a1 and a2. - -If we have 3 GPUs, the Sharded DDP (= Zero-DP) splits the model onto 3 GPUs like so: - -``` -GPU0: -La | Lb | Lc ----|----|--- -a0 | b0 | c0 - -GPU1: -La | Lb | Lc ----|----|--- -a1 | b1 | c1 - -GPU2: -La | Lb | Lc ----|----|--- -a2 | b2 | c2 -``` - -In a way this is the same horizontal slicing, as tensor parallelism, if you imagine the typical DNN diagram. Vertical slicing is where one puts whole layer-groups on different GPUs. But it's just the starting point. - -Now each of these GPUs will get the usual mini-batch as it works in DP: -``` -x0 => GPU0 -x1 => GPU1 -x2 => GPU2 -``` - -The inputs are unmodified - they think they are going to be processed by the normal model. - -First, the inputs hit the layer La. - -Let's focus just on GPU0: x0 needs a0, a1, a2 params to do its forward path, but GPU0 has only a0 - it gets sent a1 from GPU1 and a2 from GPU2, bringing all pieces of the model together. - -In parallel, GPU1 gets mini-batch x1 and it only has a1, but needs a0 and a2 params, so it gets those from GPU0 and GPU2. - -Same happens to GPU2 that gets input x2. 
It gets a0 and a1 from GPU0 and GPU1, and with its a2 it reconstructs the full tensor. - -All 3 GPUs get the full tensors reconstructed and a forward happens. - -As soon as the calculation is done, the data that is no longer needed gets dropped - it's only used during the calculation. The reconstruction is done efficiently via a pre-fetch. - -And the whole process is repeated for layer Lb, then Lc forward-wise, and then backward Lc -> Lb -> La. - -To me this sounds like an efficient group backpacking weight distribution strategy: - -1. person A carries the tent -2. person B carries the stove -3. person C carries the axe - -Now each night they all share what they have with others and get from others what they don't have, and in the morning they pack up their allocated type of gear and continue on their way. This is Sharded DDP / Zero DP. - -Compare this strategy to the simple one where each person has to carry their own tent, stove and axe, which would be far more inefficient. This is DataParallel (DP and DDP) in Pytorch. - -While reading the literature on this topic you may encounter the following synonyms: Sharded, Partitioned. - -If you pay close attention the way ZeRO partitions the model's weights - it looks very similar to tensor parallelism which will be discussed later. This is because it partitions/shards each layer's weights, unlike vertical model parallelism which is discussed next. - -Implementations: - -- [DeepSpeed](https://www.deepspeed.ai/features/#the-zero-redundancy-optimizer) ZeRO-DP stages 1+2+3 -- [Fairscale](https://github.com/facebookresearch/fairscale/#optimizer-state-sharding-zero) ZeRO-DP stages 1+2+3 -- [`transformers` integration](main_classes/trainer#trainer-integrations) - -## Naive Model Parallelism (Vertical) and Pipeline Parallelism - -Naive Model Parallelism (MP) is where one spreads groups of model layers across multiple GPUs. The mechanism is relatively simple - switch the desired layers `.to()` the desired devices and now whenever the data goes in and out those layers switch the data to the same device as the layer and leave the rest unmodified. - -We refer to it as Vertical MP, because if you remember how most models are drawn, we slice the layers vertically. For example, if the following diagram shows an 8-layer model: - -``` -=================== =================== -| 0 | 1 | 2 | 3 | | 4 | 5 | 6 | 7 | -=================== =================== - gpu0 gpu1 -``` -we just sliced it in 2 vertically, placing layers 0-3 onto GPU0 and 4-7 to GPU1. - -Now while data travels from layer 0 to 1, 1 to 2 and 2 to 3 this is just the normal model. But when data needs to pass from layer 3 to layer 4 it needs to travel from GPU0 to GPU1 which introduces a communication overhead. If the participating GPUs are on the same compute node (e.g. same physical machine) this copying is pretty fast, but if the GPUs are located on different compute nodes (e.g. multiple machines) the communication overhead could be significantly larger. - -Then layers 4 to 5 to 6 to 7 are as a normal model would have and when the 7th layer completes we often need to send the data back to layer 0 where the labels are (or alternatively send the labels to the last layer). Now the loss can be computed and the optimizer can do its work. - -Problems: -- the main deficiency and why this one is called "naive" MP, is that all but one GPU is idle at any given moment. So if 4 GPUs are used, it's almost identical to quadrupling the amount of memory of a single GPU, and ignoring the rest of the hardware. 
Plus there is the overhead of copying the data between devices. So 4x 6GB cards will be able to accommodate the same size as 1x 24GB card using naive MP, except the latter will complete the training faster, since it doesn't have the data copying overhead. But, say, if you have 40GB cards and need to fit a 45GB model you can with 4x 40GB cards (but barely because of the gradient and optimizer states) -- shared embeddings may need to get copied back and forth between GPUs. - -Pipeline Parallelism (PP) is almost identical to a naive MP, but it solves the GPU idling problem, by chunking the incoming batch into micro-batches and artificially creating a pipeline, which allows different GPUs to concurrently participate in the computation process. - -The following illustration from the [GPipe paper](https://ai.googleblog.com/2019/03/introducing-gpipe-open-source-library.html) shows the naive MP on the top, and PP on the bottom: - -![mp-pp](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-gpipe-bubble.png) - -It's easy to see from the bottom diagram how PP has less dead zones, where GPUs are idle. The idle parts are referred to as the "bubble". - -Both parts of the diagram show a parallelism that is of degree 4. That is 4 GPUs are participating in the pipeline. So there is the forward path of 4 pipe stages F0, F1, F2 and F3 and then the return reverse order backward path of B3, B2, B1 and B0. - -PP introduces a new hyper-parameter to tune and it's `chunks` which defines how many chunks of data are sent in a sequence through the same pipe stage. For example, in the bottomw diagram you can see that `chunks=4`. GPU0 performs the same forward path on chunk 0, 1, 2 and 3 (F0,0, F0,1, F0,2, F0,3) and then it waits for other GPUs to do their work and only when their work is starting to be complete, GPU0 starts to work again doing the backward path for chunks 3, 2, 1 and 0 (B0,3, B0,2, B0,1, B0,0). - -Note that conceptually this is the same concept as gradient accumulation steps (GAS). Pytorch uses `chunks`, whereas DeepSpeed refers to the same hyper-parameter as GAS. - -Because of the chunks, PP introduces the concept of micro-batches (MBS). DP splits the global data batch size into mini-batches, so if you have a DP degree of 4, a global batch size of 1024 gets split up into 4 mini-batches of 256 each (1024/4). And if the number of `chunks` (or GAS) is 32 we end up with a micro-batch size of 8 (256/32). Each Pipeline stage works with a single micro-batch at a time. - -To calculate the global batch size of the DP + PP setup we then do: `mbs*chunks*dp_degree` (`8*32*4=1024`). - -Let's go back to the diagram. - -With `chunks=1` you end up with the naive MP, which is very inefficient. With a very large `chunks` value you end up with tiny micro-batch sizes which could be not every efficient either. So one has to experiment to find the value that leads to the highest efficient utilization of the gpus. - -While the diagram shows that there is a bubble of "dead" time that can't be parallelized because the last `forward` stage has to wait for `backward` to complete the pipeline, the purpose of finding the best value for `chunks` is to enable a high concurrent GPU utilization across all participating GPUs which translates to minimizing the size of the bubble. - -There are 2 groups of solutions - the traditional Pipeline API and the more modern solutions that make things much easier for the end user. 
- -Traditional Pipeline API solutions: -- PyTorch -- FairScale -- DeepSpeed -- Megatron-LM - -Modern solutions: -- Varuna -- Sagemaker - -Problems with traditional Pipeline API solutions: -- have to modify the model quite heavily, because Pipeline requires one to rewrite the normal flow of modules into a `nn.Sequential` sequence of the same, which may require changes to the design of the model. -- currently the Pipeline API is very restricted. If you had a bunch of python variables being passed in the very first stage of the Pipeline, you will have to find a way around it. Currently, the pipeline interface requires either a single Tensor or a tuple of Tensors as the only input and output. These tensors must have a batch size as the very first dimension, since pipeline is going to chunk the mini batch into micro-batches. Possible improvements are being discussed here https://github.com/pytorch/pytorch/pull/50693 -- conditional control flow at the level of pipe stages is not possible - e.g., Encoder-Decoder models like T5 require special workarounds to handle a conditional encoder stage. -- have to arrange each layer so that the output of one model becomes an input to the other model. - -We are yet to experiment with Varuna and SageMaker but their papers report that they have overcome the list of problems mentioned above and that they require much smaller changes to the user's model. - -Implementations: -- [Pytorch](https://pytorch.org/docs/stable/pipeline.html) (initial support in pytorch-1.8, and progressively getting improved in 1.9 and more so in 1.10). Some [examples](https://github.com/pytorch/pytorch/blob/master/benchmarks/distributed/pipeline/pipe.py) -- [FairScale](https://fairscale.readthedocs.io/en/latest/tutorials/pipe.html) -- [DeepSpeed](https://www.deepspeed.ai/tutorials/pipeline/) -- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) has an internal implementation - no API. -- [Varuna](https://github.com/microsoft/varuna) -- [SageMaker](https://arxiv.org/abs/2111.05972) - this is a proprietary solution that can only be used on AWS. -- [OSLO](https://github.com/tunib-ai/oslo) - this is implemented based on the Hugging Face Transformers. - -🤗 Transformers status: as of this writing none of the models supports full-PP. GPT2 and T5 models have naive MP support. The main obstacle is being unable to convert the models to `nn.Sequential` and have all the inputs to be Tensors. This is because currently the models include many features that make the conversion very complicated, and will need to be removed to accomplish that. - -Other approaches: - -DeepSpeed, Varuna and SageMaker use the concept of an [Interleaved Pipeline](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-core-features.html) -![interleaved-pipeline-execution](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-sagemaker-interleaved-pipeline.png) - -Here the bubble (idle time) is further minimized by prioritizing backward passes. - -Varuna further tries to improve the schedule by using simulations to discover the most efficient scheduling. - -OSLO has pipeline parallelism implementation based on the Transformers without `nn.Sequential` converting. - -## Tensor Parallelism - -In Tensor Parallelism each GPU processes only a slice of a tensor and only aggregates the full tensor for operations that require the whole thing. 
- -In this section we use concepts and diagrams from the [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) paper: [Efficient Large-Scale Language Model Training on GPU Clusters](https://arxiv.org/abs/2104.04473). - -The main building block of any transformer is a fully connected `nn.Linear` followed by a nonlinear activation `GeLU`. - -Following the Megatron's paper notation, we can write the dot-product part of it as `Y = GeLU(XA)`, where `X` and `Y` are the input and output vectors, and `A` is the weight matrix. - -If we look at the computation in matrix form, it's easy to see how the matrix multiplication can be split between multiple GPUs: -![Parallel GEMM](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-parallel_gemm.png) - -If we split the weight matrix `A` column-wise across `N` GPUs and perform matrix multiplications `XA_1` through `XA_n` in parallel, then we will end up with `N` output vectors `Y_1, Y_2, ..., Y_n` which can be fed into `GeLU` independently: -![independent GeLU](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-independent-gelu.png) - -Using this principle, we can update an MLP of arbitrary depth, without the need for any synchronization between GPUs until the very end, where we need to reconstruct the output vector from shards. The Megatron-LM paper authors provide a helpful illustration for that: -![parallel shard processing](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-parallel_shard_processing.png) - -Parallelizing the multi-headed attention layers is even simpler, since they are already inherently parallel, due to having multiple independent heads! -![parallel self-attention](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-parallel_self_attention.png) - -Special considerations: TP requires very fast network, and therefore it's not advisable to do TP across more than one node. Practically, if a node has 4 GPUs, the highest TP degree is therefore 4. If you need a TP degree of 8, you need to use nodes that have at least 8 GPUs. - -This section is based on the original much more [detailed TP overview](https://github.com/huggingface/transformers/issues/10321#issuecomment-783543530). -by [@anton-l](https://github.com/anton-l). - -SageMaker combines TP with DP for a more efficient processing. - -Alternative names: -- DeepSpeed calls it [tensor slicing](https://www.deepspeed.ai/features/#model-parallelism) - -Implementations: -- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) has an internal implementation, as it's very model-specific -- [parallelformers](https://github.com/tunib-ai/parallelformers) (only inference at the moment) -- [SageMaker](https://arxiv.org/abs/2111.05972) - this is a proprietary solution that can only be used on AWS. -- [OSLO](https://github.com/tunib-ai/oslo) has the tensor parallelism implementation based on the Transformers. - -🤗 Transformers status: -- core: not yet implemented in the core -- but if you want inference [parallelformers](https://github.com/tunib-ai/parallelformers) provides this support for most of our models. So until this is implemented in the core you can use theirs. And hopefully training mode will be supported too. 
-- Deepspeed-Inference also supports our BERT, GPT-2, and GPT-Neo models in their super-fast CUDA-kernel-based inference mode, see more [here](https://www.deepspeed.ai/tutorials/inference-tutorial/) - -## DP+PP - -The following diagram from the DeepSpeed [pipeline tutorial](https://www.deepspeed.ai/tutorials/pipeline/) demonstrates how one combines DP with PP. - -![dp-pp-2d](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-zero-dp-pp.png) - -Here it's important to see how DP rank 0 doesn't see GPU2 and DP rank 1 doesn't see GPU3. To DP there is just GPUs 0 and 1 where it feeds data as if there were just 2 GPUs. GPU0 "secretly" offloads some of its load to GPU2 using PP. And GPU1 does the same by enlisting GPU3 to its aid. - -Since each dimension requires at least 2 GPUs, here you'd need at least 4 GPUs. - -Implementations: -- [DeepSpeed](https://github.com/microsoft/DeepSpeed) -- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) -- [Varuna](https://github.com/microsoft/varuna) -- [SageMaker](https://arxiv.org/abs/2111.05972) -- [OSLO](https://github.com/tunib-ai/oslo) - -🤗 Transformers status: not yet implemented - -## DP+PP+TP - -To get an even more efficient training a 3D parallelism is used where PP is combined with TP and DP. This can be seen in the following diagram. - -![dp-pp-tp-3d](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-deepspeed-3d.png) - -This diagram is from a blog post [3D parallelism: Scaling to trillion-parameter models](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/), which is a good read as well. - -Since each dimension requires at least 2 GPUs, here you'd need at least 8 GPUs. - -Implementations: -- [DeepSpeed](https://github.com/microsoft/DeepSpeed) - DeepSpeed also includes an even more efficient DP, which they call ZeRO-DP. -- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) -- [Varuna](https://github.com/microsoft/varuna) -- [SageMaker](https://arxiv.org/abs/2111.05972) -- [OSLO](https://github.com/tunib-ai/oslo) - -🤗 Transformers status: not yet implemented, since we have no PP and TP. - -## ZeRO DP+PP+TP - -One of the main features of DeepSpeed is ZeRO, which is a super-scalable extension of DP. It has already been discussed in [ZeRO Data Parallelism](#zero-data-parallelism). Normally it's a standalone feature that doesn't require PP or TP. But it can be combined with PP and TP. - -When ZeRO-DP is combined with PP (and optionally TP) it typically enables only ZeRO stage 1 (optimizer sharding). - -While it's theoretically possible to use ZeRO stage 2 (gradient sharding) with Pipeline Parallelism, it will have bad performance impacts. There would need to be an additional reduce-scatter collective for every micro-batch to aggregate the gradients before sharding, which adds a potentially significant communication overhead. By nature of Pipeline Parallelism, small micro-batches are used and instead the focus is on trying to balance arithmetic intensity (micro-batch size) with minimizing the Pipeline bubble (number of micro-batches). Therefore those communication costs are going to hurt. - -In addition, There are already fewer layers than normal due to PP and so the memory savings won't be huge. PP already reduces gradient size by ``1/PP``, and so gradient sharding savings on top of that are less significant than pure DP. 
- -ZeRO stage 3 is not a good choice either for the same reason - more inter-node communications required. - -And since we have ZeRO, the other benefit is ZeRO-Offload. Since this is stage 1 optimizer states can be offloaded to CPU. - -Implementations: -- [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) and [Megatron-Deepspeed from BigScience](https://github.com/bigscience-workshop/Megatron-DeepSpeed), which is the fork of the former repo. -- [OSLO](https://github.com/tunib-ai/oslo) - -Important papers: - -- [Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model]( -https://arxiv.org/abs/2201.11990) - -🤗 Transformers status: not yet implemented, since we have no PP and TP. - -## FlexFlow - -[FlexFlow](https://github.com/flexflow/FlexFlow) also solves the parallelization problem in a slightly different approach. - -Paper: ["Beyond Data and Model Parallelism for Deep Neural Networks" by Zhihao Jia, Matei Zaharia, Alex Aiken](https://arxiv.org/abs/1807.05358) - -It performs a sort of 4D Parallelism over Sample-Operator-Attribute-Parameter. - -1. Sample = Data Parallelism (sample-wise parallel) -2. Operator = Parallelize a single operation into several sub-operations -3. Attribute = Data Parallelism (length-wise parallel) -4. Parameter = Model Parallelism (regardless of dimension - horizontal or vertical) - -Examples: -* Sample - -Let's take 10 batches of sequence length 512. If we parallelize them by sample dimension into 2 devices, we get 10 x 512 which becomes be 5 x 2 x 512. - -* Operator - -If we perform layer normalization, we compute std first and mean second, and then we can normalize data. Operator parallelism allows computing std and mean in parallel. So if we parallelize them by operator dimension into 2 devices (cuda:0, cuda:1), first we copy input data into both devices, and cuda:0 computes std, cuda:1 computes mean at the same time. - -* Attribute - -We have 10 batches of 512 length. If we parallelize them by attribute dimension into 2 devices, 10 x 512 will be 10 x 2 x 256. - -* Parameter - -It is similar with tensor model parallelism or naive layer-wise model parallelism. - -![flex-flow-soap](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-flexflow.jpeg) - -The significance of this framework is that it takes resources like (1) GPU/TPU/CPU vs. (2) RAM/DRAM vs. (3) fast-intra-connect/slow-inter-connect and it automatically optimizes all these algorithmically deciding which parallelisation to use where. - -One very important aspect is that FlexFlow is designed for optimizing DNN parallelizations for models with static and fixed workloads, since models with dynamic behavior may prefer different parallelization strategies across iterations. - -So the promise is very attractive - it runs a 30min simulation on the cluster of choice and it comes up with the best strategy to utilise this specific environment. If you add/remove/replace any parts it'll run and re-optimize the plan for that. And then you can train. A different setup will have its own custom optimization. - -🤗 Transformers status: not yet integrated. We already have our models FX-trace-able via [transformers.utils.fx](https://github.com/huggingface/transformers/blob/master/src/transformers/utils/fx.py), which is a prerequisite for FlexFlow, so someone needs to figure out what needs to be done to make FlexFlow work with our models. 
- - -## Which Strategy To Use When - -Here is a very rough outline at which parallelism strategy to use when. The first on each list is typically faster. - -**⇨ Single GPU** - -* Model fits onto a single GPU: - - 1. Normal use - -* Model doesn't fit onto a single GPU: - - 1. ZeRO + Offload CPU and optionally NVMe - 2. as above plus Memory Centric Tiling (see below for details) if the largest layer can't fit into a single GPU - -* Largest Layer not fitting into a single GPU: - -1. ZeRO - Enable [Memory Centric Tiling](https://deepspeed.readthedocs.io/en/latest/zero3.html#memory-centric-tiling) (MCT). It allows you to run arbitrarily large layers by automatically splitting them and executing them sequentially. MCT reduces the number of parameters that are live on a GPU, but it does not affect the activation memory. As this need is very rare as of this writing a manual override of `torch.nn.Linear` needs to be done by the user. - -**⇨ Single Node / Multi-GPU** - -* Model fits onto a single GPU: - - 1. DDP - Distributed DP - 2. ZeRO - may or may not be faster depending on the situation and configuration used - -* Model doesn't fit onto a single GPU: - - 1. PP - 2. ZeRO - 3. TP - - With very fast intra-node connectivity of NVLINK or NVSwitch all three should be mostly on par, without these PP will be faster than TP or ZeRO. The degree of TP may also make a difference. Best to experiment to find the winner on your particular setup. - - TP is almost always used within a single node. That is TP size <= gpus per node. - -* Largest Layer not fitting into a single GPU: - - 1. If not using ZeRO - must use TP, as PP alone won't be able to fit. - 2. With ZeRO see the same entry for "Single GPU" above - - -**⇨ Multi-Node / Multi-GPU** - -* When you have fast inter-node connectivity: - - 1. ZeRO - as it requires close to no modifications to the model - 2. PP+TP+DP - less communications, but requires massive changes to the model - -* when you have slow inter-node connectivity and still low on GPU memory: - - 1. DP+PP+TP+ZeRO-1 diff --git a/docs/source/en/perf_train_gpu_one.md b/docs/source/en/perf_train_gpu_one.md new file mode 100644 index 000000000000..17b62c3a1379 --- /dev/null +++ b/docs/source/en/perf_train_gpu_one.md @@ -0,0 +1,532 @@ + + +# Methods and tools for efficient training on a single GPU + +This guide demonstrates practical techniques that you can use to increase the efficiency of your model's training by +optimizing memory utilization, speeding up the training, or both. If you'd like to understand how GPU is utilized during +training, please refer to the [Model training anatomy](model_memory_anatomy) conceptual guide first. This guide +focuses on practical techniques. + + + +If you have access to a machine with multiple GPUs, these approaches are still valid, plus you can leverage additional methods outlined in the [multi-GPU section](perf_train_gpu_many). + + + +When training large models, there are two aspects that should be considered at the same time: + +* Data throughput/training time +* Model performance + +Maximizing the throughput (samples/second) leads to lower training cost. This is generally achieved by utilizing the GPU +as much as possible and thus filling GPU memory to its limit. If the desired batch size exceeds the limits of the GPU memory, +the memory optimization techniques, such as gradient accumulation, can help. + +However, if the preferred batch size fits into memory, there's no reason to apply memory-optimizing techniques because they can +slow down the training. 
Just because one can use a large batch size does not necessarily mean one should. As part of
+hyperparameter tuning, you should determine which batch size yields the best results and then optimize resources accordingly.
+
+The methods and tools covered in this guide can be classified based on the effect they have on the training process:
+
+| Method/tool | Improves training speed | Optimizes memory utilization |
+|:-----------------------------------------------------------|:------------------------|:-----------------------------|
+| [Batch size choice](#batch-size-choice) | Yes | Yes |
+| [Gradient accumulation](#gradient-accumulation) | No | Yes |
+| [Gradient checkpointing](#gradient-checkpointing) | No | Yes |
+| [Mixed precision training](#mixed-precision-training) | Yes | (No) |
+| [Optimizer choice](#optimizer-choice) | Yes | Yes |
+| [Data preloading](#data-preloading) | Yes | No |
+| [DeepSpeed Zero](#deepspeed-zero) | No | Yes |
+| [torch.compile](#using-torchcompile) | Yes | No |
+
+
+
+Note: when using mixed precision with a small model and a large batch size, there will be some memory savings but with a
+large model and a small batch size, the memory use will be larger.
+
+
+
+You can combine the above methods to get a cumulative effect. These techniques are available to you whether you are
+training your model with [`Trainer`] or writing a pure PyTorch loop, in which case you can [configure these optimizations
+with 🤗 Accelerate](#using-accelerate).
+
+If these methods do not result in sufficient gains, you can explore the following options:
+* [Look into building your own custom Docker container with efficient software prebuilds](#efficient-software-prebuilds)
+* [Consider a model that uses Mixture of Experts (MoE)](#mixture-of-experts)
+* [Convert your model to BetterTransformer to leverage PyTorch native attention](#using-pytorch-native-attention)
+
+Finally, if all of the above is still not enough, even after switching to a server-grade GPU like A100, consider moving
+to a multi-GPU setup. All these approaches are still valid in a multi-GPU setup, plus you can leverage additional parallelism
+techniques outlined in the [multi-GPU section](perf_train_gpu_many).
+
+## Batch size choice
+
+To achieve optimal performance, start by identifying the appropriate batch size. It is recommended to use batch sizes and
+input/output neuron counts that are of size 2^N. Often it's a multiple of 8, but it can be
+higher depending on the hardware being used and the model's dtype.
+
+For reference, check out NVIDIA's recommendation for [input/output neuron counts](
+https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html#input-features) and
+[batch size](https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html#batch-size) for
+fully connected layers (which are involved in GEMMs (General Matrix Multiplications)).
+
+[Tensor Core Requirements](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc)
+define the multiplier based on the dtype and the hardware. For instance, for the fp16 data type a multiple of 8 is recommended, unless
+it's an A100 GPU, in which case use multiples of 64.
+
+For parameters that are small, consider also [Dimension Quantization Effects](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#dim-quantization).
+This is where tiling happens and the right multiplier can have a significant speedup.
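+
+As a minimal illustration of the rule above, you can round a dimension (for example a padded vocabulary or projection size) up to the recommended multiple; the helper below is our own, not part of 🤗 Transformers:
+
+```py
+def pad_to_multiple(size: int, multiple: int = 8) -> int:
+    """Round `size` up to the nearest multiple (e.g. 8 for fp16, 64 for fp16 on A100)."""
+    return ((size + multiple - 1) // multiple) * multiple
+
+
+pad_to_multiple(50_257)   # 50264 - a GPT-2-sized vocabulary padded for Tensor Cores
+pad_to_multiple(300, 64)  # 320
+```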
+
+## Gradient Accumulation
+
+The **gradient accumulation** method aims to calculate gradients in smaller increments instead of computing them for the
+entire batch at once. This approach involves iteratively calculating gradients in smaller batches by performing forward
+and backward passes through the model and accumulating the gradients during the process. Once a sufficient number of
+gradients have been accumulated, the model's optimization step is executed. By employing gradient accumulation, it
+becomes possible to increase the **effective batch size** beyond the limitations imposed by the GPU's memory capacity.
+However, it is important to note that the additional forward and backward passes introduced by gradient accumulation can
+slow down the training process.
+
+You can enable gradient accumulation by adding the `gradient_accumulation_steps` argument to [`TrainingArguments`]:
+
+```py
+training_args = TrainingArguments(per_device_train_batch_size=1, gradient_accumulation_steps=4, **default_args)
+```
+
+In the above example, your effective batch size becomes 4.
+
+Alternatively, use 🤗 Accelerate to gain full control over the training loop. Find the 🤗 Accelerate example
+[further down in this guide](#using-accelerate).
+
+While it is advised to max out GPU usage as much as possible, a high number of gradient accumulation steps can
+result in a more pronounced training slowdown. Consider the following example. Let's say the `per_device_train_batch_size=4`
+without gradient accumulation hits the GPU's limit. If you would like to train with batches of size 64, do not set the
+`per_device_train_batch_size` to 1 and `gradient_accumulation_steps` to 64. Instead, keep `per_device_train_batch_size=4`
+and set `gradient_accumulation_steps=16`. This results in the same effective batch size while making better use of
+the available GPU resources.
+
+For additional information, please refer to batch size and gradient accumulation benchmarks for [RTX-3090](https://github.com/huggingface/transformers/issues/14608#issuecomment-1004392537)
+and [A100](https://github.com/huggingface/transformers/issues/15026#issuecomment-1005033957).
+
+## Gradient Checkpointing
+
+Some large models may still face memory issues even when the batch size is set to 1 and gradient accumulation is used.
+This is because there are other components that also require memory storage.
+
+Saving all activations from the forward pass in order to compute the gradients during the backward pass can result in
+significant memory overhead. The alternative approach of discarding the activations and recalculating them when needed
+during the backward pass would introduce a considerable computational overhead and slow down the training process.
+
+**Gradient checkpointing** offers a compromise between these two approaches and saves strategically selected activations
+throughout the computational graph so only a fraction of the activations need to be re-computed for the gradients. For
+an in-depth explanation of gradient checkpointing, refer to [this great article](https://medium.com/tensorflow/fitting-larger-networks-into-memory-583e3c758ff9).
+
+To enable gradient checkpointing in the [`Trainer`], pass the corresponding flag to [`TrainingArguments`]:
+
+```py
+training_args = TrainingArguments(
+    per_device_train_batch_size=1, gradient_accumulation_steps=4, gradient_checkpointing=True, **default_args
+)
+```
+
+Alternatively, use 🤗 Accelerate - find the 🤗 Accelerate example [further in this guide](#using-accelerate).
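+
+If you are writing your own training loop instead, the same feature can be enabled directly on any 🤗 Transformers model (a minimal sketch; `gpt2` is only used as an example checkpoint):
+
+```py
+from transformers import AutoModelForCausalLM
+
+model = AutoModelForCausalLM.from_pretrained("gpt2")
+model.gradient_checkpointing_enable()  # activations are recomputed during the backward pass
+```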
+
+While gradient checkpointing may improve memory efficiency, it slows training by approximately 20%.
+
+## Mixed precision training
+
+**Mixed precision training** is a technique that aims to optimize the computational efficiency of training models by
+utilizing lower-precision numerical formats for certain variables. Traditionally, most models use 32-bit floating point
+precision (fp32 or float32) to represent and process variables. However, not all variables require this high precision
+level to achieve accurate results. By reducing the precision of certain variables to lower numerical formats like 16-bit
+floating point (fp16 or float16), we can speed up the computations. Because in this approach some computations are performed
+in half-precision, while some are still in full precision, the approach is called mixed precision training.
+
+Most commonly mixed precision training is achieved by using fp16 (float16) data types; however, some GPU architectures
+(such as the Ampere architecture) offer bf16 and tf32 (CUDA internal data type) data types. Check
+out the [NVIDIA Blog](https://developer.nvidia.com/blog/accelerating-ai-training-with-tf32-tensor-cores/) to learn more about
+the differences between these data types.
+
+### fp16
+
+The main advantage of mixed precision training comes from saving the activations in half precision (fp16).
+Although the gradients are also computed in half precision they are converted back to full precision for the optimization
+step so no memory is saved here.
+While mixed precision training results in faster computations, it can also lead to more GPU memory being utilized, especially for small batch sizes.
+This is because the model is now present on the GPU in both 16-bit and 32-bit precision (1.5x the original model on the GPU).
+
+To enable mixed precision training, set the `fp16` flag to `True`:
+
+```py
+training_args = TrainingArguments(per_device_train_batch_size=4, fp16=True, **default_args)
+```
+
+If you prefer to use 🤗 Accelerate, find the 🤗 Accelerate example [further in this guide](#using-accelerate).
+
+### BF16
+
+If you have access to Ampere or newer hardware you can use bf16 for mixed precision training and evaluation. While
+bf16 has lower precision than fp16, it has a much bigger dynamic range. In fp16 the biggest number you can have
+is `65504` and any number above that will result in an overflow. A bf16 number can be as large as `3.39e+38` (!) which
+is about the same as fp32 - because both have 8 bits used for the numerical range.
+
+You can enable BF16 in the 🤗 Trainer with:
+
+```python
+training_args = TrainingArguments(bf16=True, **default_args)
+```
+
+### TF32
+
+The Ampere hardware uses a magical data type called tf32. It has the same numerical range as fp32 (an 8-bit exponent), but instead
+of 23 bits of precision it has only 10 bits (same as fp16) and uses only 19 bits in total. It's "magical" in the sense that
+you can use the normal fp32 training and/or inference code and by enabling tf32 support you can get up to 3x throughput
+improvement. All you need to do is to add the following to your code:
+
+```python
+import torch
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+```
+
+CUDA will automatically switch to using tf32 instead of fp32 where possible, assuming that the used GPU is from the Ampere series.
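+
+If you are unsure whether your GPU can take advantage of bf16 and tf32, a quick check along the following lines may help (this snippet is only illustrative and not part of 🤗 Transformers):
+
+```python
+import torch
+
+if torch.cuda.is_available():
+    major, _ = torch.cuda.get_device_capability()
+    print(f"Ampere or newer (bf16/tf32 capable): {major >= 8}")
+    print(f"bf16 supported: {torch.cuda.is_bf16_supported()}")
+    print(f"tf32 matmuls currently allowed: {torch.backends.cuda.matmul.allow_tf32}")
+```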
+
+According to [NVIDIA research](https://developer.nvidia.com/blog/accelerating-ai-training-with-tf32-tensor-cores/), the
+majority of machine learning training workloads show the same perplexity and convergence with tf32 training as with fp32.
+If you're already using fp16 or bf16 mixed precision it may help with the throughput as well.
+
+You can enable this mode in the 🤗 Trainer:
+
+```python
+TrainingArguments(tf32=True, **default_args)
+```
+
+tf32 can't be accessed directly via `tensor.to(dtype=torch.tf32)` because it is an internal CUDA data type. You need `torch>=1.7` to use tf32 data types.
+
+For additional information on tf32 vs other precisions, please refer to the following benchmarks:
+[RTX-3090](https://github.com/huggingface/transformers/issues/14608#issuecomment-1004390803) and
+[A100](https://github.com/huggingface/transformers/issues/15026#issuecomment-1004543189).
+
+## Flash Attention 2
+
+You can speed up the training throughput by using the Flash Attention 2 integration in transformers. Check out the
+[Flash Attention 2 section](./perf_infer_gpu_one#Flash-Attention-2) of the single GPU guide to learn more about how to load a model with Flash Attention 2 modules.
+
+## Optimizer choice
+
+The most common optimizer used to train transformer models is Adam or AdamW (Adam with weight decay). Adam achieves
+good convergence by storing the rolling average of the previous gradients; however, it adds an additional memory
+footprint of the order of the number of model parameters. To remedy this, you can use an alternative optimizer.
+For example, if you have [NVIDIA/apex](https://github.com/NVIDIA/apex) installed, `adamw_apex_fused` will give you the
+fastest training experience among all supported AdamW optimizers.
+
+[`Trainer`] integrates a variety of optimizers that can be used out of the box: `adamw_hf`, `adamw_torch`, `adamw_torch_fused`,
+`adamw_apex_fused`, `adamw_anyprecision`, `adafactor`, or `adamw_bnb_8bit`. More optimizers can be plugged in via a third-party implementation.
+
+Let's take a closer look at two alternatives to the AdamW optimizer:
+1. `adafactor`, which is available in [`Trainer`]
+2. `adamw_bnb_8bit`, which is also available in [`Trainer`], but a third-party integration is provided below for demonstration.
+
+For comparison, for a 3B-parameter model, like `t5-3b`:
+* A standard AdamW optimizer will need 24GB of GPU memory because it uses 8 bytes for each parameter (8*3 => 24GB)
+* Adafactor optimizer will need more than 12GB. It uses slightly more than 4 bytes for each parameter, so 4*3 and then some extra.
+* 8bit BNB quantized optimizer will use only (2*3) 6GB if all optimizer states are quantized.
+
+### Adafactor
+
+Adafactor doesn't store rolling averages for each element in weight matrices. Instead, it keeps aggregated information
+(row- and column-wise sums of the rolling averages), significantly reducing its footprint. However, compared to Adam,
+Adafactor may have slower convergence in certain cases.
+
+You can switch to Adafactor by setting `optim="adafactor"` in [`TrainingArguments`]:
+
+```py
+training_args = TrainingArguments(per_device_train_batch_size=4, optim="adafactor", **default_args)
+```
+
+Combined with other approaches (gradient accumulation, gradient checkpointing, and mixed precision training)
+you can see up to a 3x improvement in memory usage while maintaining the throughput! However, as mentioned before, the convergence of
+Adafactor can be worse than Adam's.
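+
+If you need more control over Adafactor's hyperparameters than the `optim="adafactor"` flag offers, you can also instantiate the optimizer yourself and pass it to [`Trainer`]. A sketch along these lines should work (it assumes the `model`, `training_args`, and dataset `ds` from the earlier examples):
+
+```py
+from transformers.optimization import Adafactor, AdafactorSchedule
+
+# replicate the settings recommended in the Adafactor paper: no external learning rate,
+# relative step sizes and parameter scaling handled internally by the optimizer
+optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
+lr_scheduler = AdafactorSchedule(optimizer)
+
+trainer = Trainer(model=model, args=training_args, train_dataset=ds, optimizers=(optimizer, lr_scheduler))
+```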
+
+### 8-bit Adam
+
+Instead of aggregating optimizer states like Adafactor, 8-bit Adam keeps the full state and quantizes it. Quantization
+means that it stores the state with lower precision and dequantizes it only for the optimization. This is similar to the
+idea behind mixed precision training.
+
+To use `adamw_bnb_8bit`, you simply need to set `optim="adamw_bnb_8bit"` in [`TrainingArguments`]:
+
+```py
+training_args = TrainingArguments(per_device_train_batch_size=4, optim="adamw_bnb_8bit", **default_args)
+```
+
+However, we can also use a third-party implementation of the 8-bit optimizer for demonstration purposes to see how that can be integrated.
+
+First, follow the installation guide in the GitHub [repo](https://github.com/TimDettmers/bitsandbytes) to install the `bitsandbytes` library
+that implements the 8-bit Adam optimizer.
+
+Next, you need to initialize the optimizer. This involves two steps:
+* First, group the model's parameters into two groups - one where weight decay should be applied, and the other one where it should not. Usually, biases and layer norm parameters are not weight decayed.
+* Then do some argument housekeeping to use the same parameters as the previously used AdamW optimizer.
+
+```py
+import bitsandbytes as bnb
+from torch import nn
+from transformers.trainer_pt_utils import get_parameter_names
+
+training_args = TrainingArguments(per_device_train_batch_size=4, **default_args)
+
+decay_parameters = get_parameter_names(model, [nn.LayerNorm])
+decay_parameters = [name for name in decay_parameters if "bias" not in name]
+optimizer_grouped_parameters = [
+    {
+        "params": [p for n, p in model.named_parameters() if n in decay_parameters],
+        "weight_decay": training_args.weight_decay,
+    },
+    {
+        "params": [p for n, p in model.named_parameters() if n not in decay_parameters],
+        "weight_decay": 0.0,
+    },
+]
+
+optimizer_kwargs = {
+    "betas": (training_args.adam_beta1, training_args.adam_beta2),
+    "eps": training_args.adam_epsilon,
+}
+optimizer_kwargs["lr"] = training_args.learning_rate
+adam_bnb_optim = bnb.optim.Adam8bit(optimizer_grouped_parameters, **optimizer_kwargs)
+```
+
+Finally, pass the custom optimizer as an argument to the `Trainer`:
+
+```py
+trainer = Trainer(model=model, args=training_args, train_dataset=ds, optimizers=(adam_bnb_optim, None))
+```
+
+Combined with other approaches (gradient accumulation, gradient checkpointing, and mixed precision training),
+you can expect to get about a 3x memory improvement and even slightly higher throughput than when using Adafactor.
+
+### multi_tensor
+
+PyTorch nightly builds introduced `torch.optim._multi_tensor`, which should significantly speed up the optimizers for situations
+with lots of small feature tensors. It should eventually become the default, but if you want to experiment with it sooner, take a look at this GitHub [issue](https://github.com/huggingface/transformers/issues/9965).
+
+## Data preloading
+
+One of the important requirements to reach great training speed is the ability to feed the GPU at the maximum speed it
+can handle. By default, everything happens in the main process, and it might not be able to read the data from disk fast
+enough, creating a bottleneck and leading to GPU under-utilization.
Configure the following arguments to reduce the bottleneck:
+
+- `DataLoader(pin_memory=True, ...)` - ensures the data gets preloaded into the pinned memory on CPU and typically leads to much faster transfers from CPU to GPU memory.
+- `DataLoader(num_workers=4, ...)` - spawns several workers to preload data faster. During training, watch the GPU utilization stats; if it's far from 100%, experiment with increasing the number of workers. Of course, the problem could be elsewhere, so many workers won't necessarily lead to better performance.
+
+When using [`Trainer`], the corresponding [`TrainingArguments`] are: `dataloader_pin_memory` (`True` by default) and `dataloader_num_workers` (defaults to `0`).
+
+## DeepSpeed ZeRO
+
+DeepSpeed is an open-source deep learning optimization library that is integrated with 🤗 Transformers and 🤗 Accelerate.
+It provides a wide range of features and optimizations designed to improve the efficiency and scalability of large-scale
+deep learning training.
+
+If your model fits onto a single GPU and you have enough space to fit a small batch size, you don't need to use DeepSpeed
+as it'll only slow things down. However, if the model doesn't fit onto a single GPU or you can't fit a small batch, you can
+leverage DeepSpeed ZeRO + CPU Offload, or NVMe Offload for much larger models. In this case, you need to separately
+[install the library](main_classes/deepspeed#installation), then follow one of the guides to create a configuration file
+and launch DeepSpeed:
+
+* For an in-depth guide on DeepSpeed integration with [`Trainer`], review [the corresponding documentation](main_classes/deepspeed), specifically the
+[section for a single GPU](main_classes/deepspeed#deployment-with-one-gpu). Some adjustments are required to use DeepSpeed in a notebook; please take a look at the [corresponding guide](main_classes/deepspeed#deployment-in-notebooks).
+* If you prefer to use 🤗 Accelerate, refer to the [🤗 Accelerate DeepSpeed guide](https://huggingface.co/docs/accelerate/en/usage_guides/deepspeed).
+
+## Using torch.compile
+
+PyTorch 2.0 introduced a new compile function that can optimize existing PyTorch code without requiring any modification
+other than adding a single line of code: `model = torch.compile(model)`.
+
+If using [`Trainer`], you only need to pass the `torch_compile` option in the [`TrainingArguments`]:
+
+```python
+training_args = TrainingArguments(torch_compile=True, **default_args)
+```
+
+`torch.compile` uses Python's frame evaluation API to automatically create a graph from existing PyTorch programs. After
+capturing the graph, different backends can be deployed to lower the graph to an optimized engine.
+You can find more details and benchmarks in the [PyTorch documentation](https://pytorch.org/get-started/pytorch-2.0/).
+
+`torch.compile` has a growing list of backends, which can be found by calling `torchdynamo.list_backends()`, each with its own optional dependencies.
+
+Choose which backend to use by specifying it via `torch_compile_backend` in the [`TrainingArguments`]. Some of the most commonly used backends are:
+
+**Debugging backends**:
+* `dynamo.optimize("eager")` - Uses PyTorch to run the extracted GraphModule. This is quite useful in debugging TorchDynamo issues.
+* `dynamo.optimize("aot_eager")` - Uses AotAutograd with no compiler, i.e., just using PyTorch eager for the AotAutograd's extracted forward and backward graphs. This is useful for debugging, and unlikely to give speedups.
+
+**Training & inference backends**:
+* `dynamo.optimize("inductor")` - Uses the TorchInductor backend with AotAutograd and cudagraphs by leveraging codegened Triton kernels. [Read more](https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747)
+* `dynamo.optimize("nvfuser")` - nvFuser with TorchScript. [Read more](https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593)
+* `dynamo.optimize("aot_nvfuser")` - nvFuser with AotAutograd. [Read more](https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593)
+* `dynamo.optimize("aot_cudagraphs")` - cudagraphs with AotAutograd. [Read more](https://github.com/pytorch/torchdynamo/pull/757)
+
+**Inference-only backends**:
+* `dynamo.optimize("ofi")` - Uses Torchscript optimize_for_inference. [Read more](https://pytorch.org/docs/stable/generated/torch.jit.optimize_for_inference.html)
+* `dynamo.optimize("fx2trt")` - Uses Nvidia TensorRT for inference optimizations. [Read more](https://github.com/pytorch/TensorRT/blob/master/docsrc/tutorials/getting_started_with_fx_path.rst)
+* `dynamo.optimize("onnxrt")` - Uses ONNXRT for inference on CPU/GPU. [Read more](https://onnxruntime.ai/)
+* `dynamo.optimize("ipex")` - Uses IPEX for inference on CPU. [Read more](https://github.com/intel/intel-extension-for-pytorch)
+
+For an example of using `torch.compile` with 🤗 Transformers, check out this [blog post on fine-tuning a BERT model for Text Classification using the newest PyTorch 2.0 features](https://www.philschmid.de/getting-started-pytorch-2-0-transformers).
+
+## Using 🤗 Accelerate
+
+With [🤗 Accelerate](https://huggingface.co/docs/accelerate/index) you can use the above methods while gaining full
+control over the training loop and can essentially write the loop in pure PyTorch with some minor modifications.
+
+Suppose you have combined the methods in the [`TrainingArguments`] like so:
+
+```py
+training_args = TrainingArguments(
+    per_device_train_batch_size=1,
+    gradient_accumulation_steps=4,
+    gradient_checkpointing=True,
+    fp16=True,
+    **default_args,
+)
+```
+
+The full example training loop with 🤗 Accelerate is only a handful of lines of code long:
+
+```py
+from accelerate import Accelerator
+from torch.utils.data.dataloader import DataLoader
+
+dataloader = DataLoader(ds, batch_size=training_args.per_device_train_batch_size)
+
+if training_args.gradient_checkpointing:
+    model.gradient_checkpointing_enable()
+
+accelerator = Accelerator(fp16=training_args.fp16)
+model, optimizer, dataloader = accelerator.prepare(model, adam_bnb_optim, dataloader)
+
+model.train()
+for step, batch in enumerate(dataloader, start=1):
+    loss = model(**batch).loss
+    loss = loss / training_args.gradient_accumulation_steps
+    accelerator.backward(loss)
+    if step % training_args.gradient_accumulation_steps == 0:
+        optimizer.step()
+        optimizer.zero_grad()
+```
+
+First we wrap the dataset in a [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader).
+Then we can enable gradient checkpointing by calling the model's [`~PreTrainedModel.gradient_checkpointing_enable`] method.
+When we initialize the [`Accelerator`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator)
+we can specify if we want to use mixed precision training and it will take care of it for us in the [`prepare`] call.
+
+During the [`prepare`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.prepare)
+call the dataloader will also be distributed across workers should we use multiple GPUs. We use the same [8-bit optimizer](#8-bit-adam) from the earlier example.
+
+Finally, we can add the main training loop. Note that the `backward` call is handled by 🤗 Accelerate. We can also see
+how gradient accumulation works: we normalize the loss so we get the average at the end of accumulation, and once we have
+accumulated enough steps we run the optimization.
+
+Implementing these optimization techniques with 🤗 Accelerate only takes a handful of lines of code and comes with the
+benefit of more flexibility in the training loop. For full documentation of all features, have a look at the
+[Accelerate documentation](https://huggingface.co/docs/accelerate/index).
+
+## Efficient Software Prebuilds
+
+PyTorch's [pip and conda builds](https://pytorch.org/get-started/locally/#start-locally) come prebuilt with the CUDA toolkit,
+which is enough to run PyTorch, but is insufficient if you need to build CUDA extensions.
+
+At times, additional effort may be required to pre-build some components, for instance, if you're using libraries like `apex` that
+don't come pre-compiled. In other situations figuring out how to install the right CUDA toolkit system-wide can be complicated.
+To address these scenarios, PyTorch and NVIDIA release versions of the NGC docker container that already come with
+everything prebuilt. You just need to install your programs on it, and it will run out of the box.
+
+This approach is also useful if you want to tweak the PyTorch source and/or make a new customized build.
+To find the docker image version you want, start with the [PyTorch release notes](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/)
+and choose one of the latest monthly releases. Go into the release's notes for the desired release, check that the environment's
+components match your needs (including NVIDIA Driver requirements!) and then at the very top of that document go
+to the corresponding NGC page. If for some reason you get lost, here is [the index of all PyTorch NGC images](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch).
+
+Next, follow the instructions to download and deploy the docker image.
+
+## Mixture of Experts
+
+Some recent papers reported a 4-5x training speedup and a faster inference by integrating
+Mixture of Experts (MoE) into Transformer models.
+
+Since it has been discovered that more parameters lead to better performance, this technique makes it possible to increase the
+number of parameters by an order of magnitude without increasing training costs.
+
+In this approach every other FFN layer is replaced with an MoE layer which consists of many experts, with a gated function
+that trains each expert in a balanced way depending on the input token's position in a sequence.
+
+![MoE Transformer 2x block](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/perf-moe-transformer.png)
+
+(source: [GLAM](https://ai.googleblog.com/2021/12/more-efficient-in-context-learning-with.html))
+
+You can find exhaustive details and comparison tables in the papers listed at the end of this section.
+
+The main drawback of this approach is that it requires staggering amounts of GPU memory - almost an order of magnitude
+larger than its dense equivalent.
Various distillation approaches have been proposed to overcome the much higher memory requirements.
+
+There is a direct trade-off though: you can use just a few experts with a 2-3x smaller base model instead of dozens or
+hundreds of experts, leading to a 5x smaller model and thus a moderate increase in training speed with only a moderate
+increase in memory requirements.
+
+Most related papers and implementations are built around TensorFlow/TPUs:
+
+- [GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding](https://arxiv.org/abs/2006.16668)
+- [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961)
+- [GLaM: Generalist Language Model (GLaM)](https://ai.googleblog.com/2021/12/more-efficient-in-context-learning-with.html)
+
+For PyTorch, DeepSpeed has built one as well: [DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale](https://arxiv.org/abs/2201.05596), [Mixture of Experts](https://www.deepspeed.ai/tutorials/mixture-of-experts/) - blog posts: [1](https://www.microsoft.com/en-us/research/blog/deepspeed-powers-8x-larger-moe-model-training-with-high-performance/), [2](https://www.microsoft.com/en-us/research/publication/scalable-and-efficient-moe-training-for-multitask-multilingual-models/) and specific deployment with large transformer-based natural language generation models: [blog post](https://www.deepspeed.ai/news/2021/12/09/deepspeed-moe-nlg.html), [Megatron-Deepspeed branch](https://github.com/microsoft/Megatron-DeepSpeed/tree/moe-training).
+
+## Using PyTorch native attention and Flash Attention
+
+PyTorch 2.0 released a native [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) (SDPA),
+which allows using fused GPU kernels such as [memory-efficient attention](https://arxiv.org/abs/2112.05682) and [flash attention](https://arxiv.org/abs/2205.14135).
+
+After installing the [`optimum`](https://github.com/huggingface/optimum) package, the relevant internal modules can be
+replaced to use PyTorch's native attention with:
+
+```python
+model = model.to_bettertransformer()
+```
+
+Once converted, train the model as usual.
+
+The PyTorch-native `scaled_dot_product_attention` operator can only dispatch to Flash Attention if no `attention_mask` is provided.
+
+By default, in training mode, the BetterTransformer integration **drops the mask support and can only be used for training that does not require a padding mask for batched training**. This is the case, for example, during masked language modeling or causal language modeling. BetterTransformer is not suited for fine-tuning models on tasks that require a padding mask.
+
+Check out this [blog post](https://pytorch.org/blog/out-of-the-box-acceleration/) to learn more about acceleration and memory-savings with SDPA.
\ No newline at end of file
diff --git a/docs/source/en/perf_train_gpu_one.mdx b/docs/source/en/perf_train_gpu_one.mdx deleted file mode 100644 index 07299b016f59..000000000000 --- a/docs/source/en/perf_train_gpu_one.mdx +++ /dev/null @@ -1,744 +0,0 @@ - - -# Efficient Training on a Single GPU - -This guide focuses on training large models efficiently on a single GPU. These approaches are still valid if you have access to a machine with multiple GPUs but you will also have access to additional methods outlined in the [multi-GPU section](perf_train_gpu_many).
- -In this section we have a look at a few tricks to reduce the memory footprint and speed up training for large models and how they are integrated in the [`Trainer`] and [🤗 Accelerate](https://huggingface.co/docs/accelerate/). Each method can improve speed or memory usage which is summarized in the table below: - -|Method|Speed|Memory| -|:-----|:----|:-----| -| Gradient accumulation | No | Yes | -| Gradient checkpointing | No| Yes | -| Mixed precision training | Yes | (No) | -| Batch size | Yes | Yes | -| Optimizer choice | Yes | Yes | -| DataLoader | Yes | No | -| DeepSpeed Zero | No | Yes | - -A bracket means that it might not be strictly the case but is usually either not a main concern or negligible. Before we start make sure you have installed the following libraries: - -```bash -pip install transformers datasets accelerate nvidia-ml-py3 -``` - -The `nvidia-ml-py3` library allows us to monitor the memory usage of the models from within Python. You might be familiar with the `nvidia-smi` command in the terminal - this library allows to access the same information in Python directly. - -Then we create some dummy data. We create random token IDs between 100 and 30000 and binary labels for a classifier. In total we get 512 sequences each with length 512 and store them in a [`~datasets.Dataset`] with PyTorch format. - - -```py -import numpy as np -from datasets import Dataset - - -seq_len, dataset_size = 512, 512 -dummy_data = { - "input_ids": np.random.randint(100, 30000, (dataset_size, seq_len)), - "labels": np.random.randint(0, 1, (dataset_size)), -} -ds = Dataset.from_dict(dummy_data) -ds.set_format("pt") -``` - -We want to print some summary statistics for the GPU utilization and the training run with the [`Trainer`]. We setup a two helper functions to do just that: - -```py -from pynvml import * - - -def print_gpu_utilization(): - nvmlInit() - handle = nvmlDeviceGetHandleByIndex(0) - info = nvmlDeviceGetMemoryInfo(handle) - print(f"GPU memory occupied: {info.used//1024**2} MB.") - - -def print_summary(result): - print(f"Time: {result.metrics['train_runtime']:.2f}") - print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}") - print_gpu_utilization() -``` - -Let's verify that we start with a free GPU memory: - -```py ->>> print_gpu_utilization() -GPU memory occupied: 0 MB. -``` - -That looks good: the GPU memory is not occupied as we would expect before we load any models. If that's not the case on your machine make sure to stop all processes that are using GPU memory. However, not all free GPU memory can be used by the user. When a model is loaded to the GPU also the kernels are loaded which can take up 1-2GB of memory. To see how much it is we load a tiny tensor into the GPU which triggers the kernels to be loaded as well. - -```py ->>> import torch - - ->>> torch.ones((1, 1)).to("cuda") ->>> print_gpu_utilization() -GPU memory occupied: 1343 MB. -``` - -We see that the kernels alone take up 1.3GB of GPU memory. Now let's see how much space the model uses. - -## Load Model - -First, we load the `bert-large-uncased` model. We load the model weights directly to the GPU so that we can check how much space just weights use. - - -```py ->>> from transformers import AutoModelForSequenceClassification - - ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-large-uncased").to("cuda") ->>> print_gpu_utilization() -GPU memory occupied: 2631 MB. -``` - -We can see that the model weights alone take up 1.3 GB of the GPU memory. 
The exact number depends on the specific GPU you are using. Note that on newer GPUs a model can sometimes take up more space since the weights are loaded in an optimized fashion that speeds up the usage of the model. Now we can also quickly check if we get the same result as with `nvidia-smi` CLI: - - -```bash -nvidia-smi -``` - -```bash -Tue Jan 11 08:58:05 2022 -+-----------------------------------------------------------------------------+ -| NVIDIA-SMI 460.91.03 Driver Version: 460.91.03 CUDA Version: 11.2 | -|-------------------------------+----------------------+----------------------+ -| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | -| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | -| | | MIG M. | -|===============================+======================+======================| -| 0 Tesla V100-SXM2... On | 00000000:00:04.0 Off | 0 | -| N/A 37C P0 39W / 300W | 2631MiB / 16160MiB | 0% Default | -| | | N/A | -+-------------------------------+----------------------+----------------------+ - -+-----------------------------------------------------------------------------+ -| Processes: | -| GPU GI CI PID Type Process name GPU Memory | -| ID ID Usage | -|=============================================================================| -| 0 N/A N/A 3721 C ...nvs/codeparrot/bin/python 2629MiB | -+-----------------------------------------------------------------------------+ -``` - -We get the same number as before and you can also see that we are using a V100 GPU with 16GB of memory. So now we can start training the model and see how the GPU memory consumption changes. First, we set up a few standard training arguments that we will use across all our experiments: - -```py -default_args = { - "output_dir": "tmp", - "evaluation_strategy": "steps", - "num_train_epochs": 1, - "log_level": "error", - "report_to": "none", -} -``` - - - - Note: In order to properly clear the memory after experiments we need restart the Python kernel between experiments. Run all steps above and then just one of the experiments below. - - - -## Vanilla Training - -As a first experiment we will use the [`Trainer`] and train the model without any further modifications and a batch size of 4: - -```py -from transformers import TrainingArguments, Trainer, logging - -logging.set_verbosity_error() - - -training_args = TrainingArguments(per_device_train_batch_size=4, **default_args) -trainer = Trainer(model=model, args=training_args, train_dataset=ds) -result = trainer.train() -print_summary(result) -``` - -``` -Time: 57.82 -Samples/second: 8.86 -GPU memory occupied: 14949 MB. -``` - -We see that already a relatively small batch size almost fills up our GPU's entire memory. However, a larger batch size can often result in faster model convergence or better end performance. So ideally we want to tune the batch size to our model's needs and not to the GPU limitations. What's interesting is that we use much more memory than the size of the model. To understand a bit better why this is the case let's have look at a model's operations and memory needs. - -## Anatomy of Model's Operations - -Transformers architecture includes 3 main groups of operations grouped below by compute-intensity. - -1. **Tensor Contractions** - - Linear layers and components of Multi-Head Attention all do batched **matrix-matrix multiplications**. These operations are the most compute-intensive part of training a transformer. - -2. 
**Statistical Normalizations** - - Softmax and layer normalization are less compute-intensive than tensor contractions, and involve one or more **reduction operations**, the result of which is then applied via a map. - -3. **Element-wise Operators** - - These are the remaining operators: **biases, dropout, activations, and residual connections**. These are the least compute-intensive operations. - -This knowledge can be helpful to know when analyzing performance bottlenecks. - -This summary is derived from [Data Movement Is All You Need: A Case Study on Optimizing Transformers 2020](https://arxiv.org/abs/2007.00072) - - -## Anatomy of Model's Memory -We've seen that training the model uses much more memory than just putting the model on the GPU. This is because there are many components during training that use GPU memory. The components on GPU memory are the following: -1. model weights -2. optimizer states -3. gradients -4. forward activations saved for gradient computation -5. temporary buffers -6. functionality-specific memory - -A typical model trained in mixed precision with AdamW requires 18 bytes per model parameter plus activation memory. For inference there are no optimizer states and gradients, so we can subtract those. And thus we end up with 6 bytes per model parameter for mixed precision inference, plus activation memory. - -Let's look at the details. - -**Model Weights:** - -- 4 bytes * number of parameters for fp32 training -- 6 bytes * number of parameters for mixed precision training (maintains a model in fp32 and one in fp16 in memory) - -**Optimizer States:** - -- 8 bytes * number of parameters for normal AdamW (maintains 2 states) -- 2 bytes * number of parameters for 8-bit AdamW optimizers like [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) -- 4 bytes * number of parameters for optimizers like SGD with momentum (maintains only 1 state) - -**Gradients** - -- 4 bytes * number of parameters for either fp32 or mixed precision training (gradients are always kept in fp32) - -**Forward Activations** - -- size depends on many factors, the key ones being sequence length, hidden size and batch size. - -There are the input and output that are being passed and returned by the forward and the backward functions and the forward activations saved for gradient computation. - -**Temporary Memory** - -Additionally there are all kinds of temporary variables which get released once the calculation is done, but in the moment these could require additional memory and could push to OOM. Therefore when coding it's crucial to think strategically about such temporary variables and sometimes to explicitly free those as soon as they are no longer needed. - -**Functionality-specific memory** - -Then your software could have special memory needs. For example, when generating text using beam search, the software needs to maintain multiple copies of inputs and outputs. - -**`forward` vs `backward` Execution Speed** - -For convolutions and linear layers there are 2x flops in the backward compared to the forward, which generally translates into ~2x slower (sometimes more, because sizes in the backward tend to be more awkward). Activations are usually bandwidth-limited, and it’s typical for an activation to have to read more data in the backward than in the forward (e.g. activation forward reads once, writes once, activation backward reads twice, gradOutput and output of the forward, and writes once, gradInput). 
- -So there are potentially a few places where we could save GPU memory or speed up operations. Let's start with a simple optimization: choosing the right batch size. - -## Batch sizes - -One gets the most efficient performance when batch sizes and input/output neuron counts are divisible by a certain number, which typically starts at 8, but can be much higher as well. That number varies a lot depending on the specific hardware being used and the dtype of the model. - -For example for fully connected layers (which correspond to GEMMs), NVIDIA provides recommendations for [input/output neuron counts]( -https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html#input-features) and [batch size](https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html#batch-size). - -[Tensor Core Requirements](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc) define the multiplier based on the dtype and the hardware. For example, for fp16 a multiple of 8 is recommended, but on A100 it's 64! - -For parameters that are small, there is also [Dimension Quantization Effects](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#dim-quantization) to consider, this is where tiling happens and the right multiplier can have a significant speedup. - -## Gradient Accumulation - -The idea behind gradient accumulation is to instead of calculating the gradients for the whole batch at once to do it in smaller steps. The way we do that is to calculate the gradients iteratively in smaller batches by doing a forward and backward pass through the model and accumulating the gradients in the process. When enough gradients are accumulated we run the model's optimization step. This way we can easily increase the overall batch size to numbers that would never fit into the GPU's memory. In turn, however, the added forward and backward passes can slow down the training a bit. - -We can use gradient accumulation in the [`Trainer`] by simply adding the `gradient_accumulation_steps` argument to [`TrainingArguments`]. Let's see how it impacts the models memory footprint: - -```py -training_args = TrainingArguments(per_device_train_batch_size=1, gradient_accumulation_steps=4, **default_args) - -trainer = Trainer(model=model, args=training_args, train_dataset=ds) -result = trainer.train() -print_summary(result) -``` - -``` -Time: 66.03 -Samples/second: 7.75 -GPU memory occupied: 8681 MB. -``` - -We can see that the memory footprint was dramatically reduced at the cost of being only slightly slower than the vanilla run. Of course, this would change as you increase the number of accumulation steps. In general you would want to max out the GPU usage as much as possible. So in our case, the batch_size of 4 was already pretty close to the GPU's limit. If we wanted to train with a batch size of 64 we should not use `per_device_train_batch_size=1` and `gradient_accumulation_steps=64` but instead `per_device_train_batch_size=4` and `gradient_accumulation_steps=16` which has the same effective batch size while making better use of the available GPU resources. - -For more details see the benchmarks for [RTX-3090](https://github.com/huggingface/transformers/issues/14608#issuecomment-1004392537) -and [A100](https://github.com/huggingface/transformers/issues/15026#issuecomment-1005033957). 
- -Next we have a look at another trick to save a little bit more GPU memory called gradient checkpointing. - -## Gradient Checkpointing - -Even when we set the batch size to 1 and use gradient accumulation we can still run out of memory when working with large models. In order to compute the gradients during the backward pass all activations from the forward pass are normally saved. This can create a big memory overhead. Alternatively, one could forget all activations during the forward pass and recompute them on demand during the backward pass. This would however add a significant computational overhead and slow down training. - -Gradient checkpointing strikes a compromise between the two approaches and saves strategically selected activations throughout the computational graph so only a fraction of the activations need to be re-computed for the gradients. See [this great article](https://medium.com/tensorflow/fitting-larger-networks-into-memory-583e3c758ff9) explaining the ideas behind gradient checkpointing. - -To enable gradient checkpointing in the [`Trainer`] we only need to pass it as a flag to the [`TrainingArguments`]. Everything else is handled under the hood: - -```py -training_args = TrainingArguments( - per_device_train_batch_size=1, gradient_accumulation_steps=4, gradient_checkpointing=True, **default_args -) - -trainer = Trainer(model=model, args=training_args, train_dataset=ds) -result = trainer.train() -print_summary(result) -``` - -``` -Time: 85.47 -Samples/second: 5.99 -GPU memory occupied: 6775 MB. -``` - -We can see that this saved some more memory but at the same time training became a bit slower. A general rule of thumb is that gradient checkpointing slows down training by about 20%. Let's have a look at another method with which we can regain some speed: mixed precision training. - - -## Floating Data Types - -The idea of mixed precision training is that not all variables need to be stored in full (32-bit) floating point precision. If we can reduce the precision the variables and their computations are faster. Here are the commonly used floating point data types choice of which impacts both memory usage and throughput: - -- fp32 (`float32`) -- fp16 (`float16`) -- bf16 (`bfloat16`) -- tf32 (CUDA internal data type) - -Here is a diagram that shows how these data types correlate to each other. - -![data types](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tf32-bf16-fp16-fp32.png) -(source: [NVIDIA Blog](https://developer.nvidia.com/blog/accelerating-ai-training-with-tf32-tensor-cores/)) - -While fp16 and fp32 have been around for quite some time, bf16 and tf32 are only available on the Ampere architecture GPUS and TPUs support bf16 as well. Let's start with the most commonly used method which is FP16 training/ - - -### FP16 Training - -The idea of mixed precision training is that not all variables need to be stored in full (32-bit) floating point precision. If we can reduce the precision the variales and their computations are faster. The main advantage comes from saving the activations in half (16-bit) precision. Although the gradients are also computed in half precision they are converted back to full precision for the optimization step so no memory is saved here. Since the model is present on the GPU in both 16-bit and 32-bit precision this can use more GPU memory (1.5x the original model is on the GPU), especially for small batch sizes. 
Since some computations are performed in full and some in half precision this approach is also called mixed precision training. Enabling mixed precision training is also just a matter of setting the `fp16` flag to `True`: - -```py -training_args = TrainingArguments(per_device_train_batch_size=4, fp16=True, **default_args) - -trainer = Trainer(model=model, args=training_args, train_dataset=ds) -result = trainer.train() -print_summary(result) -``` - -``` -Time: 27.46 -Samples/second: 18.64 -GPU memory occupied: 13939 MB. -``` - -We can see that this is almost twice as fast as the vanilla training. Let's add it to the mix of the previous methods: - - -```py -training_args = TrainingArguments( - per_device_train_batch_size=1, - gradient_accumulation_steps=4, - gradient_checkpointing=True, - fp16=True, - **default_args, -) - -trainer = Trainer(model=model, args=training_args, train_dataset=ds) -result = trainer.train() -print_summary(result) -``` - -``` -Time: 50.76 -Samples/second: 10.09 -GPU memory occupied: 7275 MB. -``` - -We can see that with these tweaks we use about half the GPU memory as at the beginning while also being slightly faster. - -### BF16 -If you have access to a Ampere or newer hardware you can use bf16 for your training and evaluation. While bf16 has a worse precision than fp16, it has a much much bigger dynamic range. Therefore, if in the past you were experiencing overflow issues while training the model, bf16 will prevent this from happening most of the time. Remember that in fp16 the biggest number you can have is `65535` and any number above that will overflow. A bf16 number can be as large as `3.39e+38` (!) which is about the same as fp32 - because both have 8-bits used for the numerical range. - -You can enable BF16 in the 🤗 Trainer with: - -```python -TrainingArguments(bf16=True) -``` - -### TF32 -The Ampere hardware uses a magical data type called tf32. It has the same numerical range as fp32 (8-bits), but instead of 23 bits precision it has only 10 bits (same as fp16) and uses only 19 bits in total. - -It's magical in the sense that you can use the normal fp32 training and/or inference code and by enabling tf32 support you can get up to 3x throughput improvement. All you need to do is to add this to your code: - -``` -import torch -torch.backends.cuda.matmul.allow_tf32 = True -``` - -When this is done CUDA will automatically switch to using tf32 instead of fp32 where it's possible. This, of course, assumes that the used GPU is from the Ampere series. - -Like all cases with reduced precision this may or may not be satisfactory for your needs, so you have to experiment and see. According to [NVIDIA research](https://developer.nvidia.com/blog/accelerating-ai-training-with-tf32-tensor-cores/) the majority of machine learning training shouldn't be impacted and showed the same perplexity and convergence as the fp32 training. - -If you're already using fp16 or bf16 mixed precision it may help with the throughput as well. - -You can enable this mode in the 🤗 Trainer with: -```python -TrainingArguments(tf32=True) -``` -By default the PyTorch default is used. - -Note: tf32 mode is internal to CUDA and can't be accessed directly via `tensor.to(dtype=torch.tf32)` as `torch.tf32` doesn't exist. - -Note: you need `torch>=1.7` to enjoy this feature. 
- -You can also see a variety of benchmarks on tf32 vs other precisions: -[RTX-3090](https://github.com/huggingface/transformers/issues/14608#issuecomment-1004390803) and -[A100](https://github.com/huggingface/transformers/issues/15026#issuecomment-1004543189). - -We've now seen how we can change the floating types to increase throughput, but we are not done, yet! There is another area where we can save GPU memory: the optimizer. - -## Optimizer - -The most common optimizer used to train transformer model is Adam or AdamW (Adam with weight decay). Adam achieves good convergence by storing the rolling average of the previous gradients which, however, adds an additional memory footprint of the order of the number of model parameters. One remedy to this is to use an alternative optimizer such as Adafactor, which works well for some models but often it has instability issues. - -HF Trainer integrates a variety of optimisers that can be used out of box. To activate the desired optimizer simply pass the `--optim` flag to the command line. - -To see which optimizers are currently supported: - -```bash -$ python examples/pytorch/translation/run_translation.py -h | grep "\-optim" - [--optim {adamw_hf,adamw_torch,adamw_torch_xla,adamw_apex_fused,adafactor}] -``` - -For example, if you have [NVIDIA/apex](https://github.com/NVIDIA/apex) installed `--optim adamw_apex_fused` will give you the fastest training experience among all supported AdamW optimizers. - -On the other hand [8bit BNB optimizer](https://github.com/TimDettmers/bitsandbytes) can save 3/4 of memory normally used by a typical AdamW optimizer if it is configured to quantize all optimizer states, but in some situations only some optimizer states are quintized and then more memory is used. - -Let's get a feel for the numbers and use for example use a 3B-parameter model, like `t5-3b`. Note that since a Gigabyte correpsonds to a billion bytes we can simply multiply the parameters (in billions) with the number of necessary bytes per parameter to get Gigabytes of GPU memory usage: - -- A standard AdamW uses 8 bytes for each parameter, here the optimizer will need (`8*3`) 24GB of GPU memory. -- Adafactor uses slightly more than 4 bytes, so (`4*3`) 12GB and then some extra. -- 8bit BNB quantized optimizer will use only (`2*3`) 6GB if all optimizer states are quantized. - -Let's have a look at Adafactor first. - -### Adafactor - -Instead of keeping the rolling average for each element in the weight matrices Adafactor only stores aggregated information (row- and column-wise sums of the rolling averages) which reduces the footprint considerably. One downside of Adafactor is that in some instances convergence can be slower than Adam's so some experimentation is advised here. We can use Adafactor simply by setting `optim="adafactor"`: - - -```py -training_args = TrainingArguments(per_device_train_batch_size=4, optim="adafactor", **default_args) - -trainer = Trainer(model=model, args=training_args, train_dataset=ds) -result = trainer.train() -print_summary(result) -``` - -``` -Time: 64.31 -Samples/second: 7.96 -GPU memory occupied: 12295 MB. -``` - -We can see that this saves a few more GB on the GPU. 
Let's see how it looks when we add it to the other methods we introduced earlier: - - -```py -training_args = TrainingArguments( - per_device_train_batch_size=1, - gradient_accumulation_steps=4, - gradient_checkpointing=True, - fp16=True, - optim="adafactor", - **default_args, -) - -trainer = Trainer(model=model, args=training_args, train_dataset=ds) -result = trainer.train() -print_summary(result) -``` - -``` -Time: 56.54 -Samples/second: 9.06 -GPU memory occupied: 4847 MB. -``` - -We went from 15 GB memory usage to 5 GB - a 3x improvement while maintaining the throughput! However, as mentioned before, the convergence of Adafactor can be worse than Adam. There is an alternative to Adafactor called 8-bit Adam that takes a slightly different approach. - -### 8-bit Adam - -Instead of aggregating optimizer states like Adafactor, 8-bit Adam keeps the full state and quantizes it. Quantization means that it stores the state with lower precision and dequantizes it only for the optimization. This is similar to the idea behind FP16 training where using variables with lower precision saves memory. - -In contrast to the previous approaches is this one not integrated into the [`Trainer`] as a simple flag. We need to install the 8-bit optimizer and then pass it as a custom optimizer to the [`Trainer`]. Follow the installation guide in the Github [repo](https://github.com/TimDettmers/bitsandbytes) to install the `bitsandbytes` library that implements the 8-bit Adam optimizer. - -Once installed, we just need to initialize the the optimizer. Although this looks like a considerable amount of work it actually just involves two steps: first we need to group the model's parameters into two groups where to one group we apply weight decay and to the other we don't. Usually, biases and layer norm parameters are not weight decayed. Then in a second step we just do some argument housekeeping to use the same parameters as the previously used AdamW optimizer. - - -Note that in order to use the 8-bit optimizer with an existing pretrained model a change to the embedding layer is needed. -Read [this issue](https://github.com/huggingface/transformers/issues/14819) for more information. - - -```py -import bitsandbytes as bnb -from torch import nn -from transformers.trainer_pt_utils import get_parameter_names - -training_args = TrainingArguments(per_device_train_batch_size=4, **default_args) - -decay_parameters = get_parameter_names(model, [nn.LayerNorm]) -decay_parameters = [name for name in decay_parameters if "bias" not in name] -optimizer_grouped_parameters = [ - { - "params": [p for n, p in model.named_parameters() if n in decay_parameters], - "weight_decay": training_args.weight_decay, - }, - { - "params": [p for n, p in model.named_parameters() if n not in decay_parameters], - "weight_decay": 0.0, - }, -] - -optimizer_kwargs = { - "betas": (training_args.adam_beta1, training_args.adam_beta2), - "eps": training_args.adam_epsilon, -} -optimizer_kwargs["lr"] = training_args.learning_rate -adam_bnb_optim = bnb.optim.Adam8bit( - optimizer_grouped_parameters, - betas=(training_args.adam_beta1, training_args.adam_beta2), - eps=training_args.adam_epsilon, - lr=training_args.learning_rate, -) -``` - -We can now pass the custom optimizer as an argument to the `Trainer`: -```py -trainer = Trainer(model=model, args=training_args, train_dataset=ds, optimizers=(adam_bnb_optim, None)) -result = trainer.train() -print_summary(result) -``` - -``` -Time: 55.95 -Samples/second: 9.15 -GPU memory occupied: 13085 MB. 
-``` - -We can see that we get a similar memory improvement as with Adafactor while keeping the full rolling average of the gradients. Let's repeat the experiment with the full settings: - -```py -training_args = TrainingArguments( - per_device_train_batch_size=1, - gradient_accumulation_steps=4, - gradient_checkpointing=True, - fp16=True, - **default_args, -) - -trainer = Trainer(model=model, args=training_args, train_dataset=ds, optimizers=(adam_bnb_optim, None)) -result = trainer.train() -print_summary(result) -``` - -``` -Time: 49.46 -Samples/second: 10.35 -GPU memory occupied: 5363 MB. -``` - -Again, we get about a 3x memory improvement and even slightly higher throughput as using Adafactor. So we have seen how we can optimize the memory footprint of large models. The following plot summarizes all our experiments: - -![png](https://huggingface.co/datasets/lvwerra/repo-images/raw/main/gpu-memory-savings.png) - -### `_multi_tensor` -pytorch-nightly introduced `torch.optim._multi_tensor` which should significantly speed up the optimizers for situations with lots of small feature tensors. It should eventually become the default, but if you want to experiment with it sooner and don't mind using the bleed-edge, see: https://github.com/huggingface/transformers/issues/9965 - - -## Using 🤗 Accelerate - -So far we have used the [`Trainer`] to run the experiments but a more flexible alternative to that approach is to use 🤗 Accelerate. With 🤗 Accelerate you have full control over the training loop and can essentially write the loop in pure PyTorch with some minor modifications. In turn it allows you to easily scale across different infrastructures such as CPUs, GPUs, TPUs, or distributed multi-GPU setups without changing any code. Let's see what it takes to implement all of the above tweaks in 🤗 Accelerate. We can still use the [`TrainingArguments`] to wrap the training settings: - - -```py -training_args = TrainingArguments( - per_device_train_batch_size=1, - gradient_accumulation_steps=4, - gradient_checkpointing=True, - fp16=True, - **default_args, -) -``` - -The full example training loop with 🤗 Accelerate is only a handful of lines of code long: - - -```py -from accelerate import Accelerator -from torch.utils.data.dataloader import DataLoader - -dataloader = DataLoader(ds, batch_size=training_args.per_device_train_batch_size) - -if training_args.gradient_checkpointing: - model.gradient_checkpointing_enable() - -accelerator = Accelerator(fp16=training_args.fp16) -model, optimizer, dataloader = accelerator.prepare(model, adam_bnb_optim, dataloader) - -model.train() -for step, batch in enumerate(dataloader, start=1): - loss = model(**batch).loss - loss = loss / training_args.gradient_accumulation_steps - accelerator.backward(loss) - if step % training_args.gradient_accumulation_steps == 0: - optimizer.step() - optimizer.zero_grad() -``` - -First we wrap the dataset in a [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader). Then we can enable gradient checkpointing by calling the model's [`~PreTrainedModel.gradient_checkpointing_enable`] method. When we initialize the [`Accelerator`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator) we can specify if we want to use mixed precision training and it will take care of it for us in the [`prepare`] call. 
During the [`prepare`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.prepare) call the dataloader will also be distributed across workers should we use multiple GPUs. We use the same 8-bit optimizer from the earlier experiments. - -Finally, we can write the main training loop. Note that the `backward` call is handled by 🤗 Accelerate. We can also see how gradient accumulation works: we normalize the loss so we get the average at the end of accumulation and once we have enough steps we run the optimization. Now the question is: does this use the same amount of memory as the previous steps? Let's check: - - -```py ->>> print_gpu_utilization() -GPU memory occupied: 5363 MB. -``` - -Indeed it does. Implementing these optimization techniques with 🤗 Accelerate only takes a handful of lines of code and comes with the benefit of more flexiblity in the training loop. For a full documentation of all features have a look at the [Accelerate documentation](https://huggingface.co/docs/accelerate/index). - -## DataLoader - -One of the important requirements to reach great training speed is the ability to feed the GPU at the maximum speed it can handle. By default everything happens in the main process and it might not be able to read the data from disk fast enough, and thus create a bottleneck, leading to GPU under-utilization. - -- `DataLoader(pin_memory=True, ...)` which ensures that the data gets preloaded into the pinned memory on CPU and typically leads to much faster transfers from CPU to GPU memory. -- `DataLoader(num_workers=4, ...)` - spawn several workers to pre-load data faster - during training watch the GPU utilization stats and if it's far from 100% experiment with raising the number of workers. Of course, the problem could be elsewhere so a very big number of workers won't necessarily lead to a better performance. - -## DeepSpeed ZeRO - -The in-depth details on how to use Deepspeed can be found [here](main_classes/deepspeed). - -First, a quick decision tree: - -1. Model fits onto a single GPU and you have enough space to fit a small batch size - you don't need to use Deepspeed as it'll only slow things down in this use case. -2. Model doesn't fit onto a single GPU or you can't fit a small batch - use DeepSpeed ZeRO + CPU Offload and for much larger models NVMe Offload. - -Now if the decision tree suggested you use DeepSpeed first you need to [install it](main_classes/deepspeed#installation), then follow one of the following guides to create a configuration file and launch DeepSpeed. - -Activation: - -- HF Trainer-based examples: see this [guide](main_classes/deepspeed#deployment-with-one-gpu). -- Custom HF Trainer-based program: Same as above, but pass: - - ```python - TrainingArguments(deepspeed="/path/to/ds_config.json") - ``` -- Deployment in Notebooks: see this [guide](main_classes/deepspeed#deployment-in-notebooks). - -- Custom training loop: This is somewhat complex but you can study how this is implemented in [HF Trainer]( -https://github.com/huggingface/transformers/blob/master/src/transformers/trainer.py) - simply search for `deepspeed` in the code. - - -## Choice of GPU -Sometimes, even when applying all the above tweaks the throughput on a given GPU might still not be good enough. One easy solution is to change the type of GPU. For example switching from let's say a K80 (which you typically get on Google Colab) to a fancier GPU such as the V100 or A100. 
Although they are more expensive, they are usually more cost-effective than cheaper GPUs due to their larger memory and faster architecture. - -Now, let's take a step back and discuss what we should optimize for when scaling the training of large models. - -## How to scale - -When we train models there are two aspects we want to optimize at the same time: - -- Data throughput/training time -- Model performance - -We have seen that each method changes the memory usage and throughput. In general, we want to maximize the throughput (samples/second) to minimize the training cost. This is generally achieved by utilizing the GPU as much as possible and thus filling GPU memory to its limit. For example, as mentioned earlier, we only employ gradient accumulation when we want to use a batch size beyond the size of the GPU memory. If the desired batch size fits into memory then there is no reason to apply gradient accumulation, which will only slow down training. - -The second objective is model performance. Just because we can use a large batch size does not mean we should. As part of hyperparameter tuning you should determine which batch size yields the best result and then optimize the throughput accordingly. - - -## Efficient Software Prebuilds - -PyTorch's [pip and conda builds](https://pytorch.org/get-started/locally/#start-locally) come prebuilt with the CUDA toolkit, which is enough to run PyTorch, but it is insufficient if you need to build CUDA extensions. - -At times it may take an additional effort to pre-build some components, e.g., if you're using libraries like `apex` that don't come pre-compiled. In other situations figuring out how to install the right CUDA toolkit system-wide can be complicated. To address these users' needs, PyTorch and NVIDIA release new versions of the NGC docker container which already comes with everything prebuilt: you just need to install your programs on it and it will run out of the box. - -This approach is also useful if you want to tweak the PyTorch source and/or make a new customized build. - -To find the docker image version you want, start [here](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/) and choose one of the latest monthly releases. Go into the release notes for the desired release, check that the environment's components match your needs (including NVIDIA Driver requirements!) and then at the very top of that document go to the corresponding NGC page. If for some reason you get lost, here is [the index of all PyTorch NGC images](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch). - -Next, follow the instructions to download and deploy the docker image. - -## Sparsity - -### Mixture of Experts - -Quite a few of the recent papers reported a 4-5x training speedup and faster inference by integrating -Mixture of Experts (MoE) into Transformer models. - -Since it has been discovered that more parameters lead to better performance, this technique allows increasing the number of parameters by an order of magnitude without increasing training costs. - -In this approach every other FFN layer is replaced with an MoE layer, which consists of many experts with a gated function that trains each expert in a balanced way depending on the input token's position in a sequence. A toy sketch of such a layer is shown below.
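To make the gating idea concrete, here is a toy sketch of an MoE layer with top-1 routing standing in for a dense FFN block. This is only an illustration, not the implementation used in the papers below; the sizes, the `ToyMoELayer` name and the routing details are assumptions:

```py
import torch
import torch.nn as nn


class ToyMoELayer(nn.Module):
    # A deliberately simple MoE layer: one gate, several expert FFNs, top-1 routing.
    def __init__(self, hidden_size=512, ffn_size=2048, num_experts=8):
        super().__init__()
        self.gate = nn.Linear(hidden_size, num_experts)  # scores each token for each expert
        self.experts = nn.ModuleList(
            [
                nn.Sequential(nn.Linear(hidden_size, ffn_size), nn.GELU(), nn.Linear(ffn_size, hidden_size))
                for _ in range(num_experts)
            ]
        )

    def forward(self, hidden_states):  # (batch, seq_len, hidden_size)
        gate_probs = self.gate(hidden_states).softmax(dim=-1)
        top1 = gate_probs.argmax(dim=-1)  # chosen expert per token
        output = torch.zeros_like(hidden_states)
        for i, expert in enumerate(self.experts):
            mask = top1 == i
            if mask.any():
                # Weight by the gate probability so the gate is trained (argmax itself is not differentiable).
                output[mask] = expert(hidden_states[mask]) * gate_probs[..., i][mask].unsqueeze(-1)
        return output
```

Real implementations (GShard, Switch Transformers, DeepSpeed-MoE) vectorize the routing, cap each expert's capacity and add an auxiliary load-balancing loss, but the overall structure - a gate plus a set of expert FFNs - is the same.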
- -![MoE Transformer 2x block](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/perf-moe-transformer.png) - -(source: [GLAM](https://ai.googleblog.com/2021/12/more-efficient-in-context-learning-with.html)) - -You can find exhaustive details and comparison tables in the papers listed at the end of this section. - -The main drawback of this approach is that it requires staggering amounts of GPU memory - almost an order of magnitude larger than its dense equivalent. Various distillation and other approaches have been proposed to overcome the much higher memory requirements. - -There is a direct trade-off though: you can use just a few experts with a 2-3x smaller base model instead of dozens or hundreds of experts, leading to a 5x smaller model, and thus increase the training speed moderately while increasing the memory requirements moderately as well. - -Most related papers and implementations are built around TensorFlow/TPUs: - -- [GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding](https://arxiv.org/abs/2006.16668) -- [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) -- [GLaM: Generalist Language Model (GLaM)](https://ai.googleblog.com/2021/12/more-efficient-in-context-learning-with.html) - -And for PyTorch, DeepSpeed has built one as well: [DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale](https://arxiv.org/abs/2201.05596), [Mixture of Experts](https://www.deepspeed.ai/tutorials/mixture-of-experts/) - blog posts: [1](https://www.microsoft.com/en-us/research/blog/deepspeed-powers-8x-larger-moe-model-training-with-high-performance/), [2](https://www.microsoft.com/en-us/research/publication/scalable-and-efficient-moe-training-for-multitask-multilingual-models/) and specific deployment with large transformer-based natural language generation models: [blog post](https://www.deepspeed.ai/news/2021/12/09/deepspeed-moe-nlg.html), [Megatron-Deepspeed branch](https://github.com/microsoft/Megatron-DeepSpeed/tree/moe-training). - - -## Scaling beyond a single GPU - -For some applications, such as pretraining large language models, applying all the approaches above might still not be fast enough. In this case you want to scale your experiment to several GPUs. - -Another use case for training on many GPUs is if the model does not fit on a single GPU with all the mentioned tricks. There are still more methods we can apply, although life starts to get a bit more complicated. This usually involves some form of pipeline or tensor parallelism where the model itself is distributed across several GPUs. One can also make use of DeepSpeed, which implements some of these parallelism strategies along with some more optimizations to reduce the memory footprint, such as partitioning the optimizer states. You can read more about this in the ["Multi-GPU training" section](perf_train_gpu_many). - -## Using torch.compile - -PyTorch 2.0 introduces a new compile function; you can learn more about it [in their documentation](https://pytorch.org/get-started/pytorch-2.0/). It uses Python’s frame evaluation API to automatically create a graph from existing PyTorch programs. After capturing the graph, different backends can be deployed to lower the graph to an optimized engine. You can choose one of the options below for a performance boost.
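Before looking at the individual backends, here is a minimal sketch of what using it looks like (assuming PyTorch 2.0 or later is installed; the checkpoint name below is just an example). Wrapping a model is a one-liner, and the default TorchInductor backend is used unless you pick another one:

```py
import torch
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

# The first forward pass triggers graph capture and compilation; later calls reuse the optimized graph.
compiled_model = torch.compile(model)  # equivalent to choosing the "inductor" backend
```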
- -`torch.compile` has a growing list of backends, which can be found in [backends.py](https://github.com/pytorch/pytorch/blob/master/torch/_dynamo/optimizations/backends.py) -or `torchdynamo.list_backends()` each of which with its optional dependencies. - -Some of the most commonly used backends are - -**Debugging backends**: -* `dynamo.optimize("eager")` - Uses PyTorch to run the extracted GraphModule. This is quite useful in debugging TorchDynamo issues. -* `dynamo.optimize("aot_eager")` - Uses AotAutograd with no compiler, i.e, just using PyTorch eager for the AotAutograd's extracted forward and backward graphs. This is useful for debugging, and unlikely to give speedups. - -**Training & inference backends**: -* `dynamo.optimize("inductor")` - Uses TorchInductor backend with AotAutograd and cudagraphs by leveraging codegened Triton kernels [Read more](https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747) -* `dynamo.optimize("nvfuser")` - nvFuser with TorchScript. [Read more](https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593) -* `dynamo.optimize("aot_nvfuser")` - nvFuser with AotAutograd. [Read more](https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593) -* `dynamo.optimize("aot_cudagraphs")` - cudagraphs with AotAutograd. [Read more](https://github.com/pytorch/torchdynamo/pull/757) - -**Inference-only backend**s: -* `dynamo.optimize("ofi")` - Uses Torchscript optimize_for_inference. [Read more](https://pytorch.org/docs/stable/generated/torch.jit.optimize_for_inference.html) -* `dynamo.optimize("fx2trt")` - Uses Nvidia TensorRT for inference optimizations. [Read more](https://github.com/pytorch/TensorRT/blob/master/docsrc/tutorials/getting_started_with_fx_path.rst) -* `dynamo.optimize("onnxrt")` - Uses ONNXRT for inference on CPU/GPU. [Read more](https://onnxruntime.ai/) -* `dynamo.optimize("ipex")` - Uses IPEX for inference on CPU. [Read more](https://github.com/intel/intel-extension-for-pytorch) diff --git a/docs/source/en/perf_train_special.md b/docs/source/en/perf_train_special.md new file mode 100644 index 000000000000..48727b24fef3 --- /dev/null +++ b/docs/source/en/perf_train_special.md @@ -0,0 +1,24 @@ + + +# Training on Specialized Hardware + + + + Note: Most of the strategies introduced in the [single GPU section](perf_train_gpu_one) (such as mixed precision training or gradient accumulation) and [multi-GPU section](perf_train_gpu_many) are generic and apply to training models in general so make sure to have a look at it before diving into this section. + + + +This document will be completed soon with information on how to train on specialized hardware. diff --git a/docs/source/en/perf_train_special.mdx b/docs/source/en/perf_train_special.mdx deleted file mode 100644 index cb6b8d4090e2..000000000000 --- a/docs/source/en/perf_train_special.mdx +++ /dev/null @@ -1,20 +0,0 @@ - - -# Training on Specialized Hardware - - - - Note: Most of the strategies introduced in the [single GPU section](perf_train_gpu_one) (such as mixed precision training or gradient accumulation) and [multi-GPU section](perf_train_gpu_many) are generic and apply to training models in general so make sure to have a look at it before diving into this section. - - - -This document will be completed soon with information on how to train on specialized hardware. 
diff --git a/docs/source/en/perf_train_tpu.md b/docs/source/en/perf_train_tpu.md new file mode 100644 index 000000000000..c7b344ad81e7 --- /dev/null +++ b/docs/source/en/perf_train_tpu.md @@ -0,0 +1,24 @@ + + +# Training on TPUs + + + + Note: Most of the strategies introduced in the [single GPU section](perf_train_gpu_one) (such as mixed precision training or gradient accumulation) and [multi-GPU section](perf_train_gpu_many) are generic and apply to training models in general so make sure to have a look at it before diving into this section. + + + +This document will be completed soon with information on how to train on TPUs. diff --git a/docs/source/en/perf_train_tpu.mdx b/docs/source/en/perf_train_tpu.mdx deleted file mode 100644 index bc37e00877c2..000000000000 --- a/docs/source/en/perf_train_tpu.mdx +++ /dev/null @@ -1,20 +0,0 @@ - - -# Training on TPUs - - - - Note: Most of the strategies introduced in the [single GPU section](perf_train_gpu_one) (such as mixed precision training or gradient accumulation) and [multi-GPU section](perf_train_gpu_many) are generic and apply to training models in general so make sure to have a look at it before diving into this section. - - - -This document will be completed soon with information on how to train on TPUs. diff --git a/docs/source/en/perf_train_tpu_tf.md b/docs/source/en/perf_train_tpu_tf.md new file mode 100644 index 000000000000..011421b629c0 --- /dev/null +++ b/docs/source/en/perf_train_tpu_tf.md @@ -0,0 +1,162 @@ + + +# Training on TPU with TensorFlow + + + +If you don't need long explanations and just want TPU code samples to get started with, check out [our TPU example notebook!](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb) + + + +### What is a TPU? + +A TPU is a **Tensor Processing Unit.** They are hardware designed by Google, which are used to greatly speed up the tensor computations within neural networks, much like GPUs. They can be used for both network training and inference. They are generally accessed through Google’s cloud services, but small TPUs can also be accessed directly for free through Google Colab and Kaggle Kernels. + +Because [all TensorFlow models in 🤗 Transformers are Keras models](https://huggingface.co/blog/tensorflow-philosophy), most of the methods in this document are generally applicable to TPU training for any Keras model! However, there are a few points that are specific to the HuggingFace ecosystem (hug-o-system?) of Transformers and Datasets, and we’ll make sure to flag them up when we get to them. + +### What kinds of TPU are available? + +New users are often very confused by the range of TPUs, and the different ways to access them. The first key distinction to understand is the difference between **TPU Nodes** and **TPU VMs.** + +When you use a **TPU Node**, you are effectively indirectly accessing a remote TPU. You will need a separate VM, which will initialize your network and data pipeline and then forward them to the remote node. When you use a TPU on Google Colab, you are accessing it in the **TPU Node** style. + +Using TPU Nodes can have some quite unexpected behaviour for people who aren’t used to them! In particular, because the TPU is located on a physically different system to the machine you’re running your Python code on, your data cannot be local to your machine - any data pipeline that loads from your machine’s internal storage will totally fail! 
Instead, data must be stored in Google Cloud Storage where your data pipeline can still access it, even when the pipeline is running on the remote TPU node. + + + +If you can fit all your data in memory as `np.ndarray` or `tf.Tensor`, then you can `fit()` on that data even when using Colab or a TPU Node, without needing to upload it to Google Cloud Storage. + + + + + +**🤗Specific Hugging Face Tip🤗:** The methods `Dataset.to_tf_dataset()` and its higher-level wrapper `model.prepare_tf_dataset()` , which you will see throughout our TF code examples, will both fail on a TPU Node. The reason for this is that even though they create a `tf.data.Dataset` it is not a “pure” `tf.data` pipeline and uses `tf.numpy_function` or `Dataset.from_generator()` to stream data from the underlying HuggingFace `Dataset`. This HuggingFace `Dataset` is backed by data that is on a local disc and which the remote TPU Node will not be able to read. + + + +The second way to access a TPU is via a **TPU VM.** When using a TPU VM, you connect directly to the machine that the TPU is attached to, much like training on a GPU VM. TPU VMs are generally easier to work with, particularly when it comes to your data pipeline. All of the above warnings do not apply to TPU VMs! + +This is an opinionated document, so here’s our opinion: **Avoid using TPU Node if possible.** It is more confusing and more difficult to debug than TPU VMs. It is also likely to be unsupported in future - Google’s latest TPU, TPUv4, can only be accessed as a TPU VM, which suggests that TPU Nodes are increasingly going to become a “legacy” access method. However, we understand that the only free TPU access is on Colab and Kaggle Kernels, which uses TPU Node - so we’ll try to explain how to handle it if you have to! Check the [TPU example notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb) for code samples that explain this in more detail. + +### What sizes of TPU are available? + +A single TPU (a v2-8/v3-8/v4-8) runs 8 replicas. TPUs exist in **pods** that can run hundreds or thousands of replicas simultaneously. When you use more than a single TPU but less than a whole pod (for example, a v3-32), your TPU fleet is referred to as a **pod slice.** + +When you access a free TPU via Colab, you generally get a single v2-8 TPU. + +### I keep hearing about this XLA thing. What’s XLA, and how does it relate to TPUs? + +XLA is an optimizing compiler, used by both TensorFlow and JAX. In JAX it is the only compiler, whereas in TensorFlow it is optional (but mandatory on TPU!). The easiest way to enable it when training a Keras model is to pass the argument `jit_compile=True` to `model.compile()`. If you don’t get any errors and performance is good, that’s a great sign that you’re ready to move to TPU! + +Debugging on TPU is generally a bit harder than on CPU/GPU, so we recommend getting your code running on CPU/GPU with XLA first before trying it on TPU. You don’t have to train for long, of course - just for a few steps to make sure that your model and data pipeline are working like you expect them to. + + + +XLA compiled code is usually faster - so even if you’re not planning to run on TPU, adding `jit_compile=True` can improve your performance. Be sure to note the caveats below about XLA compatibility, though! 
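For reference, enabling XLA for a Keras model is a single argument to `compile()`. A minimal sketch (the checkpoint and learning rate below are placeholder assumptions, not recommendations):

```py
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)

# jit_compile=True asks Keras to compile the training and prediction functions with XLA.
model.compile(optimizer=optimizer, jit_compile=True)
```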
+ + + + + +**Tip born of painful experience:** Although using `jit_compile=True` is a good way to get a speed boost and test if your CPU/GPU code is XLA-compatible, it can actually cause a lot of problems if you leave it in when actually training on TPU. XLA compilation will happen implicitly on TPU, so remember to remove that line before actually running your code on a TPU! + + + +### How do I make my model XLA compatible? + +In many cases, your code is probably XLA-compatible already! However, there are a few things that work in normal TensorFlow that don’t work in XLA. We’ve distilled them into three core rules below: + + + +**🤗Specific HuggingFace Tip🤗:** We’ve put a lot of effort into rewriting our TensorFlow models and loss functions to be XLA-compatible. Our models and loss functions generally obey rule #1 and #2 by default, so you can skip over them if you’re using `transformers` models. Don’t forget about these rules when writing your own models and loss functions, though! + + + +#### XLA Rule #1: Your code cannot have “data-dependent conditionals” + +What that means is that any `if` statement cannot depend on values inside a `tf.Tensor`. For example, this code block cannot be compiled with XLA! + +```python +if tf.reduce_sum(tensor) > 10: + tensor = tensor / 2.0 +``` + +This might seem very restrictive at first, but most neural net code doesn’t need to do this. You can often get around this restriction by using `tf.cond` (see the documentation [here](https://www.tensorflow.org/api_docs/python/tf/cond)) or by removing the conditional and finding a clever math trick with indicator variables instead, like so: + +```python +sum_over_10 = tf.cast(tf.reduce_sum(tensor) > 10, tf.float32) +tensor = tensor / (1.0 + sum_over_10) +``` + +This code has exactly the same effect as the code above, but by avoiding a conditional, we ensure it will compile with XLA without problems! + +#### XLA Rule #2: Your code cannot have “data-dependent shapes” + +What this means is that the shape of all of the `tf.Tensor` objects in your code cannot depend on their values. For example, the function `tf.unique` cannot be compiled with XLA, because it returns a `tensor` containing one instance of each unique value in the input. The shape of this output will obviously be different depending on how repetitive the input `Tensor` was, and so XLA refuses to handle it! + +In general, most neural network code obeys rule #2 by default. However, there are a few common cases where it becomes a problem. One very common one is when you use **label masking**, setting your labels to a negative value to indicate that those positions should be ignored when computing the loss. If you look at NumPy or PyTorch loss functions that support label masking, you will often see code like this that uses [boolean indexing](https://numpy.org/doc/stable/user/basics.indexing.html#boolean-array-indexing): + +```python +label_mask = labels >= 0 +masked_outputs = outputs[label_mask] +masked_labels = labels[label_mask] +loss = compute_loss(masked_outputs, masked_labels) +mean_loss = torch.mean(loss) +``` + +This code is totally fine in NumPy or PyTorch, but it breaks in XLA! Why? Because the shape of `masked_outputs` and `masked_labels` depends on how many positions are masked - that makes it a **data-dependent shape.** However, just like for rule #1, we can often rewrite this code to yield exactly the same output without any data-dependent shapes. 
+ +```python +label_mask = tf.cast(labels >= 0, tf.float32) +loss = compute_loss(outputs, labels) +loss = loss * label_mask # Set negative label positions to 0 +mean_loss = tf.reduce_sum(loss) / tf.reduce_sum(label_mask) +``` + +Here, we avoid data-dependent shapes by computing the loss for every position, but zeroing out the masked positions in both the numerator and denominator when we calculate the mean, which yields exactly the same result as the first block while maintaining XLA compatibility. Note that we use the same trick as in rule #1 - converting a `tf.bool` to `tf.float32` and using it as an indicator variable. This is a really useful trick, so remember it if you need to convert your own code to XLA! + +#### XLA Rule #3: XLA will need to recompile your model for every different input shape it sees + +This is the big one. What this means is that if your input shapes are very variable, XLA will have to recompile your model over and over, which will create huge performance problems. This commonly arises in NLP models, where input texts have variable lengths after tokenization. In other modalities, static shapes are more common and this rule is much less of a problem. + +How can you get around rule #3? The key is **padding** - if you pad all your inputs to the same length, and then use an `attention_mask`, you can get the same results as you’d get from variable shapes, but without any XLA issues. However, excessive padding can cause severe slowdown too - if you pad all your samples to the maximum length in the whole dataset, you might end up with batches consisting endless padding tokens, which will waste a lot of compute and memory! + +There isn’t a perfect solution to this problem. However, you can try some tricks. One very useful trick is to **pad batches of samples up to a multiple of a number like 32 or 64 tokens.** This often only increases the number of tokens by a small amount, but it hugely reduces the number of unique input shapes, because every input shape now has to be a multiple of 32 or 64. Fewer unique input shapes means fewer XLA compilations! + + + +**🤗Specific HuggingFace Tip🤗:** Our tokenizers and data collators have methods that can help you here. You can use `padding="max_length"` or `padding="longest"` when calling tokenizers to get them to output padded data. Our tokenizers and data collators also have a `pad_to_multiple_of` argument that you can use to reduce the number of unique input shapes you see! + + + +### How do I actually train my model on TPU? + +Once your training is XLA-compatible and (if you’re using TPU Node / Colab) your dataset has been prepared appropriately, running on TPU is surprisingly easy! All you really need to change in your code is to add a few lines to initialize your TPU, and to ensure that your model and dataset are created inside a `TPUStrategy` scope. Take a look at [our TPU example notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb) to see this in action! 
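As a rough sketch of those few lines (the checkpoint name is just an example and `make_tpu_dataset()` is a hypothetical helper that returns a TPU-compatible, fixed-shape `tf.data.Dataset`):

```py
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

# Locate and initialize the TPU. On a TPU VM you may need TPUClusterResolver(tpu="local").
resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.TPUStrategy(resolver)

# Model (and optimizer) creation must happen inside the strategy scope.
with strategy.scope():
    model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
    model.compile(optimizer="adam")

train_dataset = make_tpu_dataset()  # hypothetical helper - remember the padding rules above
model.fit(train_dataset, epochs=3)
```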
+ +### Summary + +There was a lot in here, so let’s summarize with a quick checklist you can follow when you want to get your model ready for TPU training: + +- Make sure your code follows the three rules of XLA +- Compile your model with `jit_compile=True` on CPU/GPU and confirm that you can train it with XLA +- Either load your dataset into memory or use a TPU-compatible dataset loading approach (see [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb)) +- Migrate your code either to Colab (with accelerator set to “TPU”) or a TPU VM on Google Cloud +- Add TPU initializer code (see [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb)) +- Create your `TPUStrategy` and make sure dataset loading and model creation are inside the `strategy.scope()` (see [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb)) +- Don’t forget to take `jit_compile=True` out again when you move to TPU! +- 🙏🙏🙏🥺🥺🥺 +- Call model.fit() +- You did it! \ No newline at end of file diff --git a/docs/source/en/performance.md b/docs/source/en/performance.md new file mode 100644 index 000000000000..a1661a6ba5a8 --- /dev/null +++ b/docs/source/en/performance.md @@ -0,0 +1,73 @@ + + +# Performance and Scalability + +Training large transformer models and deploying them to production present various challenges. +During training, the model may require more GPU memory than available or exhibit slow training speed. In the deployment +phase, the model can struggle to handle the required throughput in a production environment. + +This documentation aims to assist you in overcoming these challenges and finding the optimal setting for your use-case. +The guides are divided into training and inference sections, as each comes with different challenges and solutions. +Within each section you'll find separate guides for different hardware configurations, such as single GPU vs. multi-GPU +for training or CPU vs. GPU for inference. + +Use this document as your starting point to navigate further to the methods that match your scenario. + +## Training + +Training large transformer models efficiently requires an accelerator such as a GPU or TPU. The most common case is where +you have a single GPU. The methods that you can apply to improve training efficiency on a single GPU extend to other setups +such as multiple GPU. However, there are also techniques that are specific to multi-GPU or CPU training. We cover them in +separate sections. + +* [Methods and tools for efficient training on a single GPU](perf_train_gpu_one): start here to learn common approaches that can help optimize GPU memory utilization, speed up the training, or both. +* [Multi-GPU training section](perf_train_gpu_many): explore this section to learn about further optimization methods that apply to a multi-GPU settings, such as data, tensor, and pipeline parallelism. +* [CPU training section](perf_train_cpu): learn about mixed precision training on CPU. +* [Efficient Training on Multiple CPUs](perf_train_cpu_many): learn about distributed CPU training. +* [Training on TPU with TensorFlow](perf_train_tpu_tf): if you are new to TPUs, refer to this section for an opinionated introduction to training on TPUs and using XLA. +* [Custom hardware for training](perf_hardware): find tips and tricks when building your own deep learning rig. 
+* [Hyperparameter Search using Trainer API](hpo_train) + +## Inference + +Efficient inference with large models in a production environment can be as challenging as training them. In the following +sections we go through the steps to run inference on CPU and single/multi-GPU setups. + +* [Inference on a single CPU](perf_infer_cpu) +* [Inference on a single GPU](perf_infer_gpu_one) +* [Multi-GPU inference](perf_infer_gpu_many) +* [XLA Integration for TensorFlow Models](tf_xla) + + +## Training and inference + +Here you'll find techniques, tips and tricks that apply whether you are training a model, or running inference with it. + +* [Instantiating a big model](big_models) +* [Troubleshooting performance issues](debugging) + +## Contribute + +This document is far from being complete and a lot more needs to be added, so if you have additions or corrections to +make please don't hesitate to open a PR or if you aren't sure start an Issue and we can discuss the details there. + +When making contributions that A is better than B, please try to include a reproducible benchmark and/or a link to the +source of that information (unless it comes directly from you). diff --git a/docs/source/en/performance.mdx b/docs/source/en/performance.mdx deleted file mode 100644 index 6c68e9b2acce..000000000000 --- a/docs/source/en/performance.mdx +++ /dev/null @@ -1,92 +0,0 @@ - - -# Performance and Scalability - -Training larger and larger transformer models and deploying them to production comes with a range of challenges. During training your model can require more GPU memory than is available or be very slow to train and when you deploy it for inference it can be overwhelmed with the throughput that is required in the production environment. This documentation is designed to help you navigate these challenges and find the best setting for your use-case. We split the guides into training and inference as they come with different challenges and solutions. Then within each of them we have separate guides for different kinds of hardware setting (e.g. single vs. multi-GPU for training or CPU vs. GPU for infrence). - -![perf_overview](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/perf_overview.png) - -This document serves as an overview and entry point for the methods that could be useful for your scenario. - -## Training - -Training transformer models efficiently requires an accelerator such as a GPU or TPU. The most common case is where you only have a single GPU, but there is also a section about multi-GPU and CPU training (with more coming soon). - - - - Note: Most of the strategies introduced in the single GPU sections (such as mixed precision training or gradient accumulation) are generic and apply to training models in general so make sure to have a look at it before diving into the following sections such as multi-GPU or CPU training. - - - -### Single GPU - -Training large models on a single GPU can be challenging but there are a number of tools and methods that make it feasible. In this section methods such as mixed precision training, gradient accumulation and checkpointing, efficient optimizers, as well as strategies to determine the best batch size are discussed. - -[Go to single GPU training section](perf_train_gpu_one) - -### Multi-GPU - -In some cases training on a single GPU is still too slow or won't fit the large model. 
Moving to a multi-GPU setup is the logical step, but training on multiple GPUs at once comes with new decisions: does each GPU have a full copy of the model or is the model itself also distributed? In this section we look at data, tensor, and pipeline parallism. - -[Go to multi-GPU training section](perf_train_gpu_many) - -### CPU - - -[Go to CPU training section](perf_train_cpu) - - -### TPU - -[_Coming soon_](perf_train_tpu) - -### Specialized Hardware - -[_Coming soon_](perf_train_special) - -## Inference - -Efficient inference with large models in a production environment can be as challenging as training them. In the following sections we go through the steps to run inference on CPU and single/multi-GPU setups. - -### CPU - -[Go to CPU inference section](perf_infer_cpu) - -### Single GPU - -[Go to single GPU inference section](perf_infer_gpu_one) - -### Multi-GPU - -[Go to multi-GPU inference section](perf_infer_gpu_many) - -### Specialized Hardware - -[_Coming soon_](perf_infer_special) - -## Hardware - -In the hardware section you can find tips and tricks when building your own deep learning rig. - -[Go to hardware section](perf_hardware) - - -## Contribute - -This document is far from being complete and a lot more needs to be added, so if you have additions or corrections to make please don't hesitate to open a PR or if you aren't sure start an Issue and we can discuss the details there. - -When making contributions that A is better than B, please try to include a reproducible benchmark and/or a link to the source of that information (unless it comes directly from you). diff --git a/docs/source/en/perplexity.md b/docs/source/en/perplexity.md new file mode 100644 index 000000000000..18abc0305b0e --- /dev/null +++ b/docs/source/en/perplexity.md @@ -0,0 +1,143 @@ + + +# Perplexity of fixed-length models + +[[open-in-colab]] + +Perplexity (PPL) is one of the most common metrics for evaluating language models. Before diving in, we should note +that the metric applies specifically to classical language models (sometimes called autoregressive or causal language +models) and is not well defined for masked language models like BERT (see [summary of the models](model_summary)). + +Perplexity is defined as the exponentiated average negative log-likelihood of a sequence. If we have a tokenized +sequence \\(X = (x_0, x_1, \dots, x_t)\\), then the perplexity of \\(X\\) is, + +$$\text{PPL}(X) = \exp \left\{ {-\frac{1}{t}\sum_i^t \log p_\theta (x_i|x_{ + +When working with approximate models, however, we typically have a constraint on the number of tokens the model can +process. The largest version of [GPT-2](model_doc/gpt2), for example, has a fixed length of 1024 tokens, so we +cannot calculate \\(p_\theta(x_t|x_{ + +This is quick to compute since the perplexity of each segment can be computed in one forward pass, but serves as a poor +approximation of the fully-factorized perplexity and will typically yield a higher (worse) PPL because the model will +have less context at most of the prediction steps. + +Instead, the PPL of fixed-length models should be evaluated with a sliding-window strategy. This involves repeatedly +sliding the context window so that the model has more context when making each prediction. + +Sliding window PPL taking advantage of all available context + +This is a closer approximation to the true decomposition of the sequence probability and will typically yield a more +favorable score. The downside is that it requires a separate forward pass for each token in the corpus. 
A good +practical compromise is to employ a strided sliding window, moving the context by larger strides rather than sliding by +1 token a time. This allows computation to proceed much faster while still giving the model a large context to make +predictions at each step. + +## Example: Calculating perplexity with GPT-2 in 🤗 Transformers + +Let's demonstrate this process with GPT-2. + +```python +from transformers import GPT2LMHeadModel, GPT2TokenizerFast + +device = "cuda" +model_id = "gpt2-large" +model = GPT2LMHeadModel.from_pretrained(model_id).to(device) +tokenizer = GPT2TokenizerFast.from_pretrained(model_id) +``` + +We'll load in the WikiText-2 dataset and evaluate the perplexity using a few different sliding-window strategies. Since +this dataset is small and we're just doing one forward pass over the set, we can just load and encode the entire +dataset in memory. + +```python +from datasets import load_dataset + +test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") +encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt") +``` + +With 🤗 Transformers, we can simply pass the `input_ids` as the `labels` to our model, and the average negative +log-likelihood for each token is returned as the loss. With our sliding window approach, however, there is overlap in +the tokens we pass to the model at each iteration. We don't want the log-likelihood for the tokens we're just treating +as context to be included in our loss, so we can set these targets to `-100` so that they are ignored. The following +is an example of how we could do this with a stride of `512`. This means that the model will have at least 512 tokens +for context when calculating the conditional likelihood of any one token (provided there are 512 preceding tokens +available to condition on). + +```python +import torch +from tqdm import tqdm + +max_length = model.config.n_positions +stride = 512 +seq_len = encodings.input_ids.size(1) + +nlls = [] +prev_end_loc = 0 +for begin_loc in tqdm(range(0, seq_len, stride)): + end_loc = min(begin_loc + max_length, seq_len) + trg_len = end_loc - prev_end_loc # may be different from stride on last loop + input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device) + target_ids = input_ids.clone() + target_ids[:, :-trg_len] = -100 + + with torch.no_grad(): + outputs = model(input_ids, labels=target_ids) + + # loss is calculated using CrossEntropyLoss which averages over valid labels + # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels + # to the left by 1. + neg_log_likelihood = outputs.loss + + nlls.append(neg_log_likelihood) + + prev_end_loc = end_loc + if end_loc == seq_len: + break + +ppl = torch.exp(torch.stack(nlls).mean()) +``` + +Running this with the stride length equal to the max input length is equivalent to the suboptimal, non-sliding-window +strategy we discussed above. The smaller the stride, the more context the model will have in making each prediction, +and the better the reported perplexity will typically be. + +When we run the above with `stride = 1024`, i.e. no overlap, the resulting PPL is `19.44`, which is about the same +as the `19.93` reported in the GPT-2 paper. By using `stride = 512` and thereby employing our striding window +strategy, this jumps down to `16.45`. This is not only a more favorable score, but is calculated in a way that is +closer to the true autoregressive decomposition of a sequence likelihood. 
diff --git a/docs/source/en/perplexity.mdx b/docs/source/en/perplexity.mdx deleted file mode 100644 index 01f861c99c5e..000000000000 --- a/docs/source/en/perplexity.mdx +++ /dev/null @@ -1,140 +0,0 @@ - - -# Perplexity of fixed-length models - -[[open-in-colab]] - -Perplexity (PPL) is one of the most common metrics for evaluating language models. Before diving in, we should note -that the metric applies specifically to classical language models (sometimes called autoregressive or causal language -models) and is not well defined for masked language models like BERT (see [summary of the models](model_summary)). - -Perplexity is defined as the exponentiated average negative log-likelihood of a sequence. If we have a tokenized -sequence \\(X = (x_0, x_1, \dots, x_t)\\), then the perplexity of \\(X\\) is, - -$$\text{PPL}(X) = \exp \left\{ {-\frac{1}{t}\sum_i^t \log p_\theta (x_i|x_{ - -When working with approximate models, however, we typically have a constraint on the number of tokens the model can -process. The largest version of [GPT-2](model_doc/gpt2), for example, has a fixed length of 1024 tokens, so we -cannot calculate \\(p_\theta(x_t|x_{ - -This is quick to compute since the perplexity of each segment can be computed in one forward pass, but serves as a poor -approximation of the fully-factorized perplexity and will typically yield a higher (worse) PPL because the model will -have less context at most of the prediction steps. - -Instead, the PPL of fixed-length models should be evaluated with a sliding-window strategy. This involves repeatedly -sliding the context window so that the model has more context when making each prediction. - -Sliding window PPL taking advantage of all available context - -This is a closer approximation to the true decomposition of the sequence probability and will typically yield a more -favorable score. The downside is that it requires a separate forward pass for each token in the corpus. A good -practical compromise is to employ a strided sliding window, moving the context by larger strides rather than sliding by -1 token a time. This allows computation to proceed much faster while still giving the model a large context to make -predictions at each step. - -## Example: Calculating perplexity with GPT-2 in 🤗 Transformers - -Let's demonstrate this process with GPT-2. - -```python -from transformers import GPT2LMHeadModel, GPT2TokenizerFast - -device = "cuda" -model_id = "gpt2-large" -model = GPT2LMHeadModel.from_pretrained(model_id).to(device) -tokenizer = GPT2TokenizerFast.from_pretrained(model_id) -``` - -We'll load in the WikiText-2 dataset and evaluate the perplexity using a few different sliding-window strategies. Since -this dataset is small and we're just doing one forward pass over the set, we can just load and encode the entire -dataset in memory. - -```python -from datasets import load_dataset - -test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") -encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt") -``` - -With 🤗 Transformers, we can simply pass the `input_ids` as the `labels` to our model, and the average negative -log-likelihood for each token is returned as the loss. With our sliding window approach, however, there is overlap in -the tokens we pass to the model at each iteration. We don't want the log-likelihood for the tokens we're just treating -as context to be included in our loss, so we can set these targets to `-100` so that they are ignored. 
The following -is an example of how we could do this with a stride of `512`. This means that the model will have at least 512 tokens -for context when calculating the conditional likelihood of any one token (provided there are 512 preceding tokens -available to condition on). - -```python -import torch -from tqdm import tqdm - -max_length = model.config.n_positions -stride = 512 -seq_len = encodings.input_ids.size(1) - -nlls = [] -prev_end_loc = 0 -for begin_loc in tqdm(range(0, seq_len, stride)): - end_loc = min(begin_loc + max_length, seq_len) - trg_len = end_loc - prev_end_loc # may be different from stride on last loop - input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device) - target_ids = input_ids.clone() - target_ids[:, :-trg_len] = -100 - - with torch.no_grad(): - outputs = model(input_ids, labels=target_ids) - - # loss is calculated using CrossEntropyLoss which averages over input tokens. - # Multiply it with trg_len to get the summation instead of average. - # We will take average over all the tokens to get the true average - # in the last step of this example. - neg_log_likelihood = outputs.loss * trg_len - - nlls.append(neg_log_likelihood) - - prev_end_loc = end_loc - if end_loc == seq_len: - break - -ppl = torch.exp(torch.stack(nlls).sum() / end_loc) -``` - -Running this with the stride length equal to the max input length is equivalent to the suboptimal, non-sliding-window -strategy we discussed above. The smaller the stride, the more context the model will have in making each prediction, -and the better the reported perplexity will typically be. - -When we run the above with `stride = 1024`, i.e. no overlap, the resulting PPL is `19.64`, which is about the same -as the `19.93` reported in the GPT-2 paper. By using `stride = 512` and thereby employing our striding window -strategy, this jumps down to `16.44`. This is not only a more favorable score, but is calculated in a way that is -closer to the true autoregressive decomposition of a sequence likelihood. diff --git a/docs/source/en/philosophy.md b/docs/source/en/philosophy.md new file mode 100644 index 000000000000..cad1e2ccdc8c --- /dev/null +++ b/docs/source/en/philosophy.md @@ -0,0 +1,79 @@ + + +# Philosophy + +🤗 Transformers is an opinionated library built for: + +- machine learning researchers and educators seeking to use, study or extend large-scale Transformers models. +- hands-on practitioners who want to fine-tune those models or serve them in production, or both. +- engineers who just want to download a pretrained model and use it to solve a given machine learning task. + +The library was designed with two strong goals in mind: + +1. Be as easy and fast to use as possible: + + - We strongly limited the number of user-facing abstractions to learn, in fact, there are almost no abstractions, + just three standard classes required to use each model: [configuration](main_classes/configuration), + [models](main_classes/model), and a preprocessing class ([tokenizer](main_classes/tokenizer) for NLP, [image processor](main_classes/image_processor) for vision, [feature extractor](main_classes/feature_extractor) for audio, and [processor](main_classes/processors) for multimodal inputs). 
+ - All of these classes can be initialized in a simple and unified way from pretrained instances by using a common + `from_pretrained()` method which downloads (if needed), caches and + loads the related class instance and associated data (configurations' hyperparameters, tokenizers' vocabulary, + and models' weights) from a pretrained checkpoint provided on [Hugging Face Hub](https://huggingface.co/models) or your own saved checkpoint. + - On top of those three base classes, the library provides two APIs: [`pipeline`] for quickly + using a model for inference on a given task and [`Trainer`] to quickly train or fine-tune a PyTorch model (all TensorFlow models are compatible with `Keras.fit`). + - As a consequence, this library is NOT a modular toolbox of building blocks for neural nets. If you want to + extend or build upon the library, just use regular Python, PyTorch, TensorFlow, Keras modules and inherit from the base + classes of the library to reuse functionalities like model loading and saving. If you'd like to learn more about our coding philosophy for models, check out our [Repeat Yourself](https://huggingface.co/blog/transformers-design-philosophy) blog post. + +2. Provide state-of-the-art models with performances as close as possible to the original models: + + - We provide at least one example for each architecture which reproduces a result provided by the official authors + of said architecture. + - The code is usually as close to the original code base as possible which means some PyTorch code may be not as + *pytorchic* as it could be as a result of being converted TensorFlow code and vice versa. + +A few other goals: + +- Expose the models' internals as consistently as possible: + + - We give access, using a single API, to the full hidden-states and attention weights. + - The preprocessing classes and base model APIs are standardized to easily switch between models. + +- Incorporate a subjective selection of promising tools for fine-tuning and investigating these models: + + - A simple and consistent way to add new tokens to the vocabulary and embeddings for fine-tuning. + - Simple ways to mask and prune Transformer heads. + +- Easily switch between PyTorch, TensorFlow 2.0 and Flax, allowing training with one framework and inference with another. + +## Main concepts + +The library is built around three types of classes for each model: + +- **Model classes** can be PyTorch models ([torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)), Keras models ([tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model)) or JAX/Flax models ([flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen.html)) that work with the pretrained weights provided in the library. +- **Configuration classes** store the hyperparameters required to build a model (such as the number of layers and hidden size). You don't always need to instantiate these yourself. In particular, if you are using a pretrained model without any modification, creating the model will automatically take care of instantiating the configuration (which is part of the model). +- **Preprocessing classes** convert the raw data into a format accepted by the model. A [tokenizer](main_classes/tokenizer) stores the vocabulary for each model and provide methods for encoding and decoding strings in a list of token embedding indices to be fed to a model. 
[Image processors](main_classes/image_processor) preprocess vision inputs, [feature extractors](main_classes/feature_extractor) preprocess audio inputs, and a [processor](main_classes/processors) handles multimodal inputs. + +All these classes can be instantiated from pretrained instances, saved locally, and shared on the Hub with three methods: + +- `from_pretrained()` lets you instantiate a model, configuration, and preprocessing class from a pretrained version either + provided by the library itself (the supported models can be found on the [Model Hub](https://huggingface.co/models)) or + stored locally (or on a server) by the user. +- `save_pretrained()` lets you save a model, configuration, and preprocessing class locally so that it can be reloaded using + `from_pretrained()`. +- `push_to_hub()` lets you share a model, configuration, and a preprocessing class to the Hub, so it is easily accessible to everyone. + diff --git a/docs/source/en/philosophy.mdx b/docs/source/en/philosophy.mdx deleted file mode 100644 index 7788d7836236..000000000000 --- a/docs/source/en/philosophy.mdx +++ /dev/null @@ -1,75 +0,0 @@ - - -# Philosophy - -🤗 Transformers is an opinionated library built for: - -- machine learning researchers and educators seeking to use, study or extend large-scale Transformers models. -- hands-on practitioners who want to fine-tune those models or serve them in production, or both. -- engineers who just want to download a pretrained model and use it to solve a given machine learning task. - -The library was designed with two strong goals in mind: - -1. Be as easy and fast to use as possible: - - - We strongly limited the number of user-facing abstractions to learn, in fact, there are almost no abstractions, - just three standard classes required to use each model: [configuration](main_classes/configuration), - [models](main_classes/model), and a preprocessing class ([tokenizer](main_classes/tokenizer) for NLP, [image processor](main_classes/image_processor) for vision, [feature extractor](main_classes/feature_extractor) for audio, and [processor](main_classes/processors) for multimodal inputs). - - All of these classes can be initialized in a simple and unified way from pretrained instances by using a common - `from_pretrained()` method which downloads (if needed), caches and - loads the related class instance and associated data (configurations' hyperparameters, tokenizers' vocabulary, - and models' weights) from a pretrained checkpoint provided on [Hugging Face Hub](https://huggingface.co/models) or your own saved checkpoint. - - On top of those three base classes, the library provides two APIs: [`pipeline`] for quickly - using a model for inference on a given task and [`Trainer`] to quickly train or fine-tune a PyTorch model (all TensorFlow models are compatible with `Keras.fit`). - - As a consequence, this library is NOT a modular toolbox of building blocks for neural nets. If you want to - extend or build upon the library, just use regular Python, PyTorch, TensorFlow, Keras modules and inherit from the base - classes of the library to reuse functionalities like model loading and saving. If you'd like to learn more about our coding philosophy for models, check out our [Repeat Yourself](https://huggingface.co/blog/transformers-design-philosophy) blog post. - -2. 
Provide state-of-the-art models with performances as close as possible to the original models: - - - We provide at least one example for each architecture which reproduces a result provided by the official authors - of said architecture. - - The code is usually as close to the original code base as possible which means some PyTorch code may be not as - *pytorchic* as it could be as a result of being converted TensorFlow code and vice versa. - -A few other goals: - -- Expose the models' internals as consistently as possible: - - - We give access, using a single API, to the full hidden-states and attention weights. - - The preprocessing classes and base model APIs are standardized to easily switch between models. - -- Incorporate a subjective selection of promising tools for fine-tuning and investigating these models: - - - A simple and consistent way to add new tokens to the vocabulary and embeddings for fine-tuning. - - Simple ways to mask and prune Transformer heads. - -- Easily switch between PyTorch, TensorFlow 2.0 and Flax, allowing training with one framework and inference with another. - -## Main concepts - -The library is built around three types of classes for each model: - -- **Model classes** can be PyTorch models ([torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)), Keras models ([tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model)) or JAX/Flax models ([flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen.html)) that work with the pretrained weights provided in the library. -- **Configuration classes** store the hyperparameters required to build a model (such as the number of layers and hidden size). You don't always need to instantiate these yourself. In particular, if you are using a pretrained model without any modification, creating the model will automatically take care of instantiating the configuration (which is part of the model). -- **Preprocessing classes** convert the raw data into a format accepted by the model. A [tokenizer](main_classes/tokenizer) stores the vocabulary for each model and provide methods for encoding and decoding strings in a list of token embedding indices to be fed to a model. [Image processors](main_classes/image_processor) preprocess vision inputs, [feature extractors](main_classes/feature_extractor) preprocess audio inputs, and a [processor](main_classes/processors) handles multimodal inputs. - -All these classes can be instantiated from pretrained instances, saved locally, and shared on the Hub with three methods: - -- `from_pretrained()` lets you instantiate a model, configuration, and preprocessing class from a pretrained version either - provided by the library itself (the supported models can be found on the [Model Hub](https://huggingface.co/models)) or - stored locally (or on a server) by the user. -- `save_pretrained()` lets you save a model, configuration, and preprocessing class locally so that it can be reloaded using - `from_pretrained()`. -- `push_to_hub()` lets you share a model, configuration, and a preprocessing class to the Hub, so it is easily accessible to everyone. 
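As a short sketch of these three methods in practice (the checkpoint and repository names are placeholder assumptions):

```py
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# from_pretrained(): download (or load from the local cache) a pretrained checkpoint
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# save_pretrained(): write the configuration, weights and tokenizer files to a local folder
model.save_pretrained("./my-finetuned-model")
tokenizer.save_pretrained("./my-finetuned-model")
model = AutoModelForSequenceClassification.from_pretrained("./my-finetuned-model")  # reload later

# push_to_hub(): share them on the Hub (requires being logged in, e.g. via `huggingface-cli login`)
model.push_to_hub("my-username/my-finetuned-model")
tokenizer.push_to_hub("my-username/my-finetuned-model")
```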
- diff --git a/docs/source/en/pipeline_tutorial.md b/docs/source/en/pipeline_tutorial.md new file mode 100644 index 000000000000..460fc17274a8 --- /dev/null +++ b/docs/source/en/pipeline_tutorial.md @@ -0,0 +1,317 @@ + + +# Pipelines for inference + +The [`pipeline`] makes it simple to use any model from the [Hub](https://huggingface.co/models) for inference on any language, computer vision, speech, and multimodal tasks. Even if you don't have experience with a specific modality or aren't familiar with the underlying code behind the models, you can still use them for inference with the [`pipeline`]! This tutorial will teach you to: + +* Use a [`pipeline`] for inference. +* Use a specific tokenizer or model. +* Use a [`pipeline`] for audio, vision, and multimodal tasks. + + + +Take a look at the [`pipeline`] documentation for a complete list of supported tasks and available parameters. + + + +## Pipeline usage + +While each task has an associated [`pipeline`], it is simpler to use the general [`pipeline`] abstraction which contains +all the task-specific pipelines. The [`pipeline`] automatically loads a default model and a preprocessing class capable +of inference for your task. Let's take the example of using the [`pipeline`] for automatic speech recognition (ASR), or +speech-to-text. + + +1. Start by creating a [`pipeline`] and specify the inference task: + +```py +>>> from transformers import pipeline + +>>> transcriber = pipeline(task="automatic-speech-recognition") +``` + +2. Pass your input to the [`pipeline`]. In the case of speech recognition, this is an audio input file: + +```py +>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") +{'text': 'I HAVE A DREAM BUT ONE DAY THIS NATION WILL RISE UP LIVE UP THE TRUE MEANING OF ITS TREES'} +``` + +Not the result you had in mind? Check out some of the [most downloaded automatic speech recognition models](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=trending) +on the Hub to see if you can get a better transcription. + +Let's try the [Whisper large-v2](https://huggingface.co/openai/whisper-large) model from OpenAI. Whisper was released +2 years later than Wav2Vec2, and was trained on close to 10x more data. As such, it beats Wav2Vec2 on most downstream +benchmarks. It also has the added benefit of predicting punctuation and casing, neither of which are possible with +Wav2Vec2. + +Let's give it a try here to see how it performs: + +```py +>>> transcriber = pipeline(model="openai/whisper-large-v2") +>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") +{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'} +``` + +Now this result looks more accurate! For a deep-dive comparison on Wav2Vec2 vs Whisper, refer to the [Audio Transformers Course](https://huggingface.co/learn/audio-course/chapter5/asr_models). +We really encourage you to check out the Hub for models in different languages, models specialized in your field, and more. +You can check out and compare model results directly from your browser on the Hub to see if it fits or +handles corner cases better than other ones. +And if you don't find a model for your use case, you can always start [training](training) your own! 
+ +If you have several inputs, you can pass your input as a list: + +```py +transcriber( + [ + "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac", + "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac", + ] +) +``` + +Pipelines are great for experimentation as switching from one model to another is trivial; however, there are some ways to optimize them for larger workloads than experimentation. See the following guides that dive into iterating over whole datasets or using pipelines in a webserver: +* [Using pipelines on a dataset](#using-pipelines-on-a-dataset) +* [Using pipelines for a webserver](./pipeline_webserver) + +## Parameters + +[`pipeline`] supports many parameters; some are task specific, and some are general to all pipelines. +In general, you can specify parameters anywhere you want: + +```py +transcriber = pipeline(model="openai/whisper-large-v2", my_parameter=1) + +out = transcriber(...) # This will use `my_parameter=1`. +out = transcriber(..., my_parameter=2) # This will override and use `my_parameter=2`. +out = transcriber(...) # This will go back to using `my_parameter=1`. +``` + +Let's check out 3 important ones: + +### Device + +If you use `device=n`, the pipeline automatically puts the model on the specified device. +This will work regardless of whether you are using PyTorch or TensorFlow. + +```py +transcriber = pipeline(model="openai/whisper-large-v2", device=0) +``` + +If the model is too large for a single GPU and you are using PyTorch, you can set `device_map="auto"` to automatically +determine how to load and store the model weights. Using the `device_map` argument requires the 🤗 [Accelerate](https://huggingface.co/docs/accelerate) +package: + +```bash +pip install --upgrade accelerate +``` + +The following code automatically loads and stores model weights across devices: + +```py +transcriber = pipeline(model="openai/whisper-large-v2", device_map="auto") +``` + +Note that if `device_map="auto"` is passed, there is no need to add a `device` argument when instantiating your `pipeline`; otherwise you may encounter unexpected behavior! + +### Batch size + +By default, pipelines will not batch inference for reasons explained in detail [here](https://huggingface.co/docs/transformers/main_classes/pipelines#pipeline-batching). The reason is that batching is not necessarily faster, and can actually be quite a bit slower in some cases. + +But if it works in your use case, you can use: + +```py +transcriber = pipeline(model="openai/whisper-large-v2", device=0, batch_size=2) +audio_filenames = [f"https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/{i}.flac" for i in range(1, 5)] +texts = transcriber(audio_filenames) +``` + +This runs the pipeline on the 4 provided audio files, but it will pass them in batches of 2 +to the model (which is on a GPU, where batching is more likely to help) without requiring any further code from you. +The output should always match what you would have received without batching. It is only meant as a way to help you get more speed out of a pipeline. + +Pipelines can also alleviate some of the complexities of batching because, for some pipelines, a single item (like a long audio file) needs to be chunked into multiple parts to be processed by a model. The pipeline performs this [*chunk batching*](./main_classes/pipelines#pipeline-chunk-batching) for you.
+ +### Task specific parameters + +All tasks provide task specific parameters which allow for additional flexibility and options to help you get your job done. +For instance, the [`transformers.AutomaticSpeechRecognitionPipeline.__call__`] method has a `return_timestamps` parameter which sounds promising for subtitling videos: + + +```py +>>> transcriber = pipeline(model="openai/whisper-large-v2", return_timestamps=True) +>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") +{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.', 'chunks': [{'timestamp': (0.0, 11.88), 'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its'}, {'timestamp': (11.88, 12.38), 'text': ' creed.'}]} +``` + +As you can see, the model inferred the text and also outputted **when** the various sentences were pronounced. + +There are many parameters available for each task, so check out each task's API reference to see what you can tinker with! +For instance, the [`~transformers.AutomaticSpeechRecognitionPipeline`] has a `chunk_length_s` parameter which is helpful +for working on really long audio files (for example, subtitling entire movies or hour-long videos) that a model typically +cannot handle on its own: + +```python +>>> transcriber = pipeline(model="openai/whisper-large-v2", chunk_length_s=30, return_timestamps=True) +>>> transcriber("https://huggingface.co/datasets/sanchit-gandhi/librispeech_long/resolve/main/audio.wav") +{'text': " Chapter 16. I might have told you of the beginning of this liaison in a few lines, but I wanted you to see every step by which we came. I, too, agree to whatever Marguerite wished, Marguerite to be unable to live apart from me. It was the day after the evening... +``` + +If you can't find a parameter that would really help you out, feel free to [request it](https://github.com/huggingface/transformers/issues/new?assignees=&labels=feature&template=feature-request.yml)! + + +## Using pipelines on a dataset + +The pipeline can also run inference on a large dataset. The easiest way we recommend doing this is by using an iterator: + +```py +def data(): + for i in range(1000): + yield f"My example {i}" + + +pipe = pipeline(model="gpt2", device=0) +generated_characters = 0 +for out in pipe(data()): + generated_characters += len(out[0]["generated_text"]) +``` + +The iterator `data()` yields each result, and the pipeline automatically +recognizes the input is iterable and will start fetching the data while +it continues to process it on the GPU (this uses [DataLoader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) under the hood). +This is important because you don't have to allocate memory for the whole dataset +and you can feed the GPU as fast as possible. + +Since batching could speed things up, it may be useful to try tuning the `batch_size` parameter here. + +The simplest way to iterate over a dataset is to just load one from 🤗 [Datasets](https://github.com/huggingface/datasets/): + +```py +# KeyDataset is a util that will just output the item we're interested in. 
+from transformers.pipelines.pt_utils import KeyDataset +from datasets import load_dataset + +pipe = pipeline(model="hf-internal-testing/tiny-random-wav2vec2", device=0) +dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:10]") + +for out in pipe(KeyDataset(dataset, "audio")): + print(out) +``` + + +## Using pipelines for a webserver + + +Creating an inference engine is a complex topic which deserves it's own +page. + + +[Link](./pipeline_webserver) + +## Vision pipeline + +Using a [`pipeline`] for vision tasks is practically identical. + +Specify your task and pass your image to the classifier. The image can be a link, a local path or a base64-encoded image. For example, what species of cat is shown below? + +![pipeline-cat-chonk](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg) + +```py +>>> from transformers import pipeline + +>>> vision_classifier = pipeline(model="google/vit-base-patch16-224") +>>> preds = vision_classifier( +... images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... ) +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> preds +[{'score': 0.4335, 'label': 'lynx, catamount'}, {'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}, {'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}, {'score': 0.0239, 'label': 'Egyptian cat'}, {'score': 0.0229, 'label': 'tiger cat'}] +``` + +## Text pipeline + +Using a [`pipeline`] for NLP tasks is practically identical. + +```py +>>> from transformers import pipeline + +>>> # This model is a `zero-shot-classification` model. +>>> # It will classify text, except you are free to choose any label you might imagine +>>> classifier = pipeline(model="facebook/bart-large-mnli") +>>> classifier( +... "I have a problem with my iphone that needs to be resolved asap!!", +... candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"], +... ) +{'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'], 'scores': [0.504, 0.479, 0.013, 0.003, 0.002]} +``` + +## Multimodal pipeline + +The [`pipeline`] supports more than one modality. For example, a visual question answering (VQA) task combines text and image. Feel free to use any image link you like and a question you want to ask about the image. The image can be a URL or a local path to the image. + +For example, if you use this [invoice image](https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png): + +```py +>>> from transformers import pipeline + +>>> vqa = pipeline(model="impira/layoutlm-document-qa") +>>> vqa( +... image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png", +... question="What is the invoice number?", +... ) +[{'score': 0.42515, 'answer': 'us-001', 'start': 16, 'end': 16}] +``` + + + +To run the example above you need to have [`pytesseract`](https://pypi.org/project/pytesseract/) installed in addition to 🤗 Transformers: + +```bash +sudo apt install -y tesseract-ocr +pip install pytesseract +``` + + + +## Using `pipeline` on large models with 🤗 `accelerate`: + +You can easily run `pipeline` on large models using 🤗 `accelerate`! First make sure you have installed `accelerate` with `pip install accelerate`. 
+ +First load your model using `device_map="auto"`! We will use `facebook/opt-1.3b` for our example. + +```py +# pip install accelerate +import torch +from transformers import pipeline + +pipe = pipeline(model="facebook/opt-1.3b", torch_dtype=torch.bfloat16, device_map="auto") +output = pipe("This is a cool example!", do_sample=True, top_p=0.95) +``` + +You can also pass 8-bit loaded models if you install `bitsandbytes` and add the argument `load_in_8bit=True` + +```py +# pip install accelerate bitsandbytes +import torch +from transformers import pipeline + +pipe = pipeline(model="facebook/opt-1.3b", device_map="auto", model_kwargs={"load_in_8bit": True}) +output = pipe("This is a cool example!", do_sample=True, top_p=0.95) +``` + +Note that you can replace the checkpoint with any of the Hugging Face model that supports large model loading such as BLOOM! diff --git a/docs/source/en/pipeline_tutorial.mdx b/docs/source/en/pipeline_tutorial.mdx deleted file mode 100644 index 4be43484e02a..000000000000 --- a/docs/source/en/pipeline_tutorial.mdx +++ /dev/null @@ -1,248 +0,0 @@ - - -# Pipelines for inference - -The [`pipeline`] makes it simple to use any model from the [Hub](https://huggingface.co/models) for inference on any language, computer vision, speech, and multimodal tasks. Even if you don't have experience with a specific modality or aren't familiar with the underlying code behind the models, you can still use them for inference with the [`pipeline`]! This tutorial will teach you to: - -* Use a [`pipeline`] for inference. -* Use a specific tokenizer or model. -* Use a [`pipeline`] for audio, vision, and multimodal tasks. - - - -Take a look at the [`pipeline`] documentation for a complete list of supported tasks and available parameters. - - - -## Pipeline usage - -While each task has an associated [`pipeline`], it is simpler to use the general [`pipeline`] abstraction which contains all the task-specific pipelines. The [`pipeline`] automatically loads a default model and a preprocessing class capable of inference for your task. - -1. Start by creating a [`pipeline`] and specify an inference task: - -```py ->>> from transformers import pipeline - ->>> generator = pipeline(task="automatic-speech-recognition") -``` - -2. Pass your input text to the [`pipeline`]: - -```py ->>> generator("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") -{'text': 'I HAVE A DREAM BUT ONE DAY THIS NATION WILL RISE UP LIVE UP THE TRUE MEANING OF ITS TREES'} -``` - -Not the result you had in mind? Check out some of the [most downloaded automatic speech recognition models](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=downloads) on the Hub to see if you can get a better transcription. -Let's try [openai/whisper-large](https://huggingface.co/openai/whisper-large): - -```py ->>> generator = pipeline(model="openai/whisper-large") ->>> generator("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") -{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'} -``` - -Now this result looks more accurate! -We really encourage you to check out the Hub for models in different languages, models specialized in your field, and more. -You can check out and compare model results directly from your browser on the Hub to see if it fits or -handles corner cases better than other ones. -And if you don't find a model for your use case, you can always start [training](training) your own! 
- -If you have several inputs, you can pass your input as a list: - -```py -generator( - [ - "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac", - "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac", - ] -) -``` - -If you want to iterate over a whole dataset, or want to use it for inference in a webserver, check out dedicated parts - -[Using pipelines on a dataset](#using-pipelines-on-a-dataset) - -[Using pipelines for a webserver](./pipeline_webserver) - -## Parameters - -[`pipeline`] supports many parameters; some are task specific, and some are general to all pipelines. -In general you can specify parameters anywhere you want: - -```py -generator(model="openai/whisper-large", my_parameter=1) -out = generate(...) # This will use `my_parameter=1`. -out = generate(..., my_parameter=2) # This will override and use `my_parameter=2`. -out = generate(...) # This will go back to using `my_parameter=1`. -``` - -Let's check out 3 important ones: - -### Device - -If you use `device=n`, the pipeline automatically puts the model on the specified device. -This will work regardless of whether you are using PyTorch or Tensorflow. - -```py -generator(model="openai/whisper-large", device=0) -``` - -If the model is too large for a single GPU, you can set `device_map="auto"` to allow 🤗 [Accelerate](https://huggingface.co/docs/accelerate) to automatically determine how to load and store the model weights. - -```py -#!pip install accelerate -generator(model="openai/whisper-large", device_map="auto") -``` - -### Batch size - -By default, pipelines will not batch inference for reasons explained in detail [here](https://huggingface.co/docs/transformers/main_classes/pipelines#pipeline-batching). The reason is that batching is not necessarily faster, and can actually be quite slower in some cases. - -But if it works in your use case, you can use: - -```py -generator(model="openai/whisper-large", device=0, batch_size=2) -audio_filenames = [f"audio_{i}.flac" for i in range(10)] -texts = generator(audio_filenames) -``` - -This runs the pipeline on the 10 provided audio files, but it will pass them in batches of 2 -to the model (which is on a GPU, where batching is more likely to help) without requiring any further code from you. -The output should always match what you would have received without batching. It is only meant as a way to help you get more speed out of a pipeline. - -Pipelines can also alleviate some of the complexities of batching because, for some pipelines, a single item (like a long audio file) needs to be chunked into multiple parts to be processed by a model. The pipeline performs this [*chunk batching*](./main_classes/pipelines#pipeline-chunk-batching) for you. - -### Task specific parameters - -All tasks provide task specific parameters which allow for additional flexibility and options to help you get your job done. -For instance, the [`transformers.AutomaticSpeechRecognitionPipeline.__call__`] method has a `return_timestamps` parameter which sounds promising for subtitling videos: - - -```py ->>> # Not using whisper, as it cannot provide timestamps. 
->>> generator = pipeline(model="facebook/wav2vec2-large-960h-lv60-self", return_timestamps="word") ->>> generator("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") -{'text': 'I HAVE A DREAM BUT ONE DAY THIS NATION WILL RISE UP AND LIVE OUT THE TRUE MEANING OF ITS CREED', 'chunks': [{'text': 'I', 'timestamp': (1.22, 1.24)}, {'text': 'HAVE', 'timestamp': (1.42, 1.58)}, {'text': 'A', 'timestamp': (1.66, 1.68)}, {'text': 'DREAM', 'timestamp': (1.76, 2.14)}, {'text': 'BUT', 'timestamp': (3.68, 3.8)}, {'text': 'ONE', 'timestamp': (3.94, 4.06)}, {'text': 'DAY', 'timestamp': (4.16, 4.3)}, {'text': 'THIS', 'timestamp': (6.36, 6.54)}, {'text': 'NATION', 'timestamp': (6.68, 7.1)}, {'text': 'WILL', 'timestamp': (7.32, 7.56)}, {'text': 'RISE', 'timestamp': (7.8, 8.26)}, {'text': 'UP', 'timestamp': (8.38, 8.48)}, {'text': 'AND', 'timestamp': (10.08, 10.18)}, {'text': 'LIVE', 'timestamp': (10.26, 10.48)}, {'text': 'OUT', 'timestamp': (10.58, 10.7)}, {'text': 'THE', 'timestamp': (10.82, 10.9)}, {'text': 'TRUE', 'timestamp': (10.98, 11.18)}, {'text': 'MEANING', 'timestamp': (11.26, 11.58)}, {'text': 'OF', 'timestamp': (11.66, 11.7)}, {'text': 'ITS', 'timestamp': (11.76, 11.88)}, {'text': 'CREED', 'timestamp': (12.0, 12.38)}]} -``` - -As you can see, the model inferred the text and also outputted **when** the various words were pronounced -in the sentence. - -There are many parameters available for each task, so check out each task's API reference to see what you can tinker with! -For instance, the [`~transformers.AutomaticSpeechRecognitionPipeline`] has a `chunk_length_s` parameter which is helpful for working on really long audio files (for example, subtitling entire movies or hour-long videos) that a model typically cannot handle on its own. - - -If you can't find a parameter that would really help you out, feel free to [request it](https://github.com/huggingface/transformers/issues/new?assignees=&labels=feature&template=feature-request.yml)! - - -## Using pipelines on a dataset - -The pipeline can also run inference on a large dataset. The easiest way we recommend doing this is by using an iterator: - -```py -def data(): - for i in range(1000): - yield f"My example {i}" - - -pipe = pipe(model="gpt2", device=0) -generated_characters = 0 -for out in pipe(data()): - generated_characters += len(out["generated_text"]) -``` - -The iterator `data()` yields each result, and the pipeline automatically -recognizes the input is iterable and will start fetching the data while -it continues to process it on the GPU (this uses [DataLoader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) under the hood). -This is important because you don't have to allocate memory for the whole dataset -and you can feed the GPU as fast as possible. - -Since batching could speed things up, it may be useful to try tuning the `batch_size` parameter here. - -The simplest way to iterate over a dataset is to just load one from 🤗 [Datasets](https://github.com/huggingface/datasets/): - -```py -# KeyDataset is a util that will just output the item we're interested in. -from transformers.pipelines.pt_utils import KeyDataset - -pipe = pipeline(model="hf-internal-testing/tiny-random-wav2vec2", device=0) -dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:10]") - -for out in pipe(KeyDataset(dataset["audio"])): - print(out) -``` - - -## Using pipelines for a webserver - - -Creating an inference engine is a complex topic which deserves it's own -page. 
- - -[Link](./pipeline_webserver) - -## Vision pipeline - -Using a [`pipeline`] for vision tasks is practically identical. - -Specify your task and pass your image to the classifier. The image can be a link or a local path to the image. For example, what species of cat is shown below? - -![pipeline-cat-chonk](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg) - -```py ->>> from transformers import pipeline - ->>> vision_classifier = pipeline(model="google/vit-base-patch16-224") ->>> preds = vision_classifier( -... images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" -... ) ->>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] ->>> preds -[{'score': 0.4335, 'label': 'lynx, catamount'}, {'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}, {'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}, {'score': 0.0239, 'label': 'Egyptian cat'}, {'score': 0.0229, 'label': 'tiger cat'}] -``` - -## Text pipeline - -Using a [`pipeline`] for NLP tasks is practically identical. - -```py ->>> from transformers import pipeline - ->>> # This model is a `zero-shot-classification` model. ->>> # It will classify text, except you are free to choose any label you might imagine ->>> classifier = pipeline(model="facebook/bart-large-mnli") ->>> classifier( -... "I have a problem with my iphone that needs to be resolved asap!!", -... candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"], -... ) -{'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'], 'scores': [0.504, 0.479, 0.013, 0.003, 0.002]} -``` - -## Multimodal pipeline - -The [`pipeline`] supports more than one modality. For example, a visual question answering (VQA) task combines text and image. Feel free to use any image link you like and a question you want to ask about the image. The image can be a URL or a local path to the image. - -For example, if you use this [invoice image](https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png): - -```py ->>> from transformers import pipeline - ->>> vqa = pipeline(model="impira/layoutlm-document-qa") ->>> vqa( -... image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png", -... question="What is the invoice number?", -... ) -[{'score': 0.42514941096305847, 'answer': 'us-001', 'start': 16, 'end': 16}] -``` diff --git a/docs/source/en/pipeline_webserver.md b/docs/source/en/pipeline_webserver.md new file mode 100644 index 000000000000..38ef28d498c6 --- /dev/null +++ b/docs/source/en/pipeline_webserver.md @@ -0,0 +1,168 @@ + + +# Using pipelines for a webserver + + +Creating an inference engine is a complex topic, and the "best" solution +will most likely depend on your problem space. Are you on CPU or GPU? Do +you want the lowest latency, the highest throughput, support for +many models, or just highly optimize 1 specific model? +There are many ways to tackle this topic, so what we are going to present is a good default +to get started which may not necessarily be the most optimal solution for you. 
+ + + +The key thing to understand is that we can use an iterator, just like you would [on a +dataset](pipeline_tutorial#using-pipelines-on-a-dataset), since a webserver is basically a system that waits for requests and +treats them as they come in. + +Usually webservers are multiplexed (multithreaded, async, etc..) to handle various +requests concurrently. Pipelines on the other hand (and mostly the underlying models) +are not really great for parallelism; they take up a lot of RAM, so it's best to give them all the available resources when they are running or it's a compute-intensive job. + +We are going to solve that by having the webserver handle the light load of receiving +and sending requests, and having a single thread handling the actual work. +This example is going to use `starlette`. The actual framework is not really +important, but you might have to tune or change the code if you are using another +one to achieve the same effect. + +Create `server.py`: + +```py +from starlette.applications import Starlette +from starlette.responses import JSONResponse +from starlette.routing import Route +from transformers import pipeline +import asyncio + + +async def homepage(request): + payload = await request.body() + string = payload.decode("utf-8") + response_q = asyncio.Queue() + await request.app.model_queue.put((string, response_q)) + output = await response_q.get() + return JSONResponse(output) + + +async def server_loop(q): + pipe = pipeline(model="bert-base-uncased") + while True: + (string, response_q) = await q.get() + out = pipe(string) + await response_q.put(out) + + +app = Starlette( + routes=[ + Route("/", homepage, methods=["POST"]), + ], +) + + +@app.on_event("startup") +async def startup_event(): + q = asyncio.Queue() + app.model_queue = q + asyncio.create_task(server_loop(q)) +``` + +Now you can start it with: +```bash +uvicorn server:app +``` + +And you can query it: +```bash +curl -X POST -d "test [MASK]" http://localhost:8000/ +#[{"score":0.7742936015129089,"token":1012,"token_str":".","sequence":"test."},...] +``` + +And there you go, now you have a good idea of how to create a webserver! + +What is really important is that we load the model only **once**, so there are no copies +of the model on the webserver. This way, no unnecessary RAM is being used. +Then the queuing mechanism allows you to do fancy stuff like maybe accumulating a few +items before inferring to use dynamic batching: + + + +The code sample below is intentionally written like pseudo-code for readability. +Do not run this without checking if it makes sense for your system resources! + + + +```py +(string, rq) = await q.get() +strings = [] +queues = [] +while True: + try: + (string, rq) = await asyncio.wait_for(q.get(), timeout=0.001) # 1ms + except asyncio.exceptions.TimeoutError: + break + strings.append(string) + queues.append(rq) +strings +outs = pipe(strings, batch_size=len(strings)) +for rq, out in zip(queues, outs): + await rq.put(out) +``` + +Again, the proposed code is optimized for readability, not for being the best code. +First of all, there's no batch size limit which is usually not a +great idea. Next, the timeout is reset on every queue fetch, meaning you could +wait much more than 1ms before running the inference (delaying the first request +by that much). + +It would be better to have a single 1ms deadline. + +This will always wait for 1ms even if the queue is empty, which might not be the +best since you probably want to start doing inference if there's nothing in the queue. 
+But maybe it does make sense if batching is really crucial for your use case. +Again, there's really no one best solution. + + +## Few things you might want to consider + +### Error checking + +There's a lot that can go wrong in production: out of memory, out of space, +loading the model might fail, the query might be wrong, the query might be +correct but still fail to run because of a model misconfiguration, and so on. + +Generally, it's good if the server outputs the errors to the user, so +adding a lot of `try..except` statements to show those errors is a good +idea. But keep in mind it may also be a security risk to reveal all those errors depending +on your security context. + +### Circuit breaking + +Webservers usually look better when they do circuit breaking. It means they +return proper errors when they're overloaded instead of just waiting for the query indefinitely. Return a 503 error instead of waiting for a super long time or a 504 after a long time. + +This is relatively easy to implement in the proposed code since there is a single queue. +Looking at the queue size is a basic way to start returning errors before your +webserver fails under load. + +### Blocking the main thread + +Currently PyTorch is not async aware, and computation will block the main +thread while running. That means it would be better if PyTorch was forced to run +on its own thread/process. This wasn't done here because the code is a lot more +complex (mostly because threads and async and queues don't play nice together). +But ultimately it does the same thing. + +This would be important if the inference of single items were long (> 1s) because +in this case, it means every query during inference would have to wait for 1s before +even receiving an error. + +### Dynamic batching + +In general, batching is not necessarily an improvement over passing 1 item at +a time (see [batching details](./main_classes/pipelines#pipeline-batching) for more information). But it can be very effective +when used in the correct setting. In the API, there is no dynamic +batching by default (too much opportunity for a slowdown). But for BLOOM inference - +which is a very large model - dynamic batching is **essential** to provide a decent experience for everyone. diff --git a/docs/source/en/pipeline_webserver.mdx b/docs/source/en/pipeline_webserver.mdx deleted file mode 100644 index d9f12fa2b3a0..000000000000 --- a/docs/source/en/pipeline_webserver.mdx +++ /dev/null @@ -1,161 +0,0 @@ -# Using pipelines for a webserver - - -Creating an inference engine is a complex topic, and the "best" solution -will most likely depend on your problem space. Are you on CPU or GPU? Do -you want the lowest latency, the highest throughput, support for -many models, or just highly optimize 1 specific model? -There are many ways to tackle this topic, so what we are going to present is a good default -to get started which may not necessarily be the most optimal solution for you. - - - -The key thing to understand is that we can use an iterator, just like you would [on a -dataset](pipeline_tutorial#using-pipelines-on-a-dataset), since a webserver is basically a system that waits for requests and -treats them as they come in. - -Usually webservers are multiplexed (multithreaded, async, etc..) to handle various -requests concurrently. 
Pipelines on the other hand (and mostly the underlying models) -are not really great for parallelism; they take up a lot of RAM, so it's best to give them all the available resources when they are running or it's a compute-intensive job. - -We are going to solve that by having the webserver handle the light load of receiving -and sending requests, and having a single thread handling the actual work. -This example is going to use `starlette`. The actual framework is not really -important, but you might have to tune or change the code if you are using another -one to achieve the same effect. - -Create `server.py`: - -```py -from starlette.applications import Starlette -from starlette.responses import JSONResponse -from starlette.routing import Route -from transformers import pipeline -import asyncio - - -async def homepage(request): - payload = await request.body() - string = payload.decode("utf-8") - response_q = asyncio.Queue() - await request.app.model_queue.put((string, response_q)) - output = await response_q.get() - return JSONResponse(output) - - -async def server_loop(q): - pipe = pipeline(model="bert-base-uncased") - while True: - (string, response_q) = await q.get() - out = pipe(string) - await response_q.put(out) - - -app = Starlette( - routes=[ - Route("/", homepage, methods=["POST"]), - ], -) - - -@app.on_event("startup") -async def startup_event(): - q = asyncio.Queue() - app.model_queue = q - asyncio.create_task(server_loop(q)) -``` - -Now you can start it with: -```bash -uvicorn server:app -``` - -And you can query it: -```bash -curl -X POST -d "test [MASK]" http://localhost:8000/ -#[{"score":0.7742936015129089,"token":1012,"token_str":".","sequence":"test."},...] -``` - -And there you go, now you have a good idea of how to create a webserver! - -What is really important is that we load the model only **once**, so there are no copies -of the model on the webserver. This way, no unnecessary RAM is being used. -Then the queuing mechanism allows you to do fancy stuff like maybe accumulating a few -items before inferring to use dynamic batching: - -```py -(string, rq) = await q.get() -strings = [] -queues = [] -while True: - try: - (string, rq) = await asyncio.wait_for(q.get(), timeout=0.001) # 1ms - except asyncio.exceptions.TimeoutError: - break - strings.append(string) - queues.append(rq) -strings -outs = pipe(strings, batch_size=len(strings)) -for (rq, out) in zip(queues, outs): - await rq.put(out) -``` - - -Do not activate this without checking it makes sense for your load! - - -The proposed code is optimized for readability, not for being the best code. -First of all, there's no batch size limit which is usually not a -great idea. Next, the timeout is reset on every queue fetch, meaning you could -wait much more than 1ms before running the inference (delaying the first request -by that much). - -It would be better to have a single 1ms deadline. - -This will always wait for 1ms even if the queue is empty, which might not be the -best since you probably want to start doing inference if there's nothing in the queue. -But maybe it does make sense if batching is really crucial for your use case. -Again, there's really no one best solution. - - -## Few things you might want to consider - -### Error checking - -There's a lot that can go wrong in production: out of memory, out of space, -loading the model might fail, the query might be wrong, the query might be -correct but still fail to run because of a model misconfiguration, and so on. 
- -Generally, it's good if the server outputs the errors to the user, so -adding a lot of `try..except` statements to show those errors is a good -idea. But keep in mind it may also be a security risk to reveal all those errors depending -on your security context. - -### Circuit breaking - -Webservers usually look better when they do circuit breaking. It means they -return proper errors when they're overloaded instead of just waiting for the query indefinitely. Return a 503 error instead of waiting for a super long time or a 504 after a long time. - -This is relatively easy to implement in the proposed code since there is a single queue. -Looking at the queue size is a basic way to start returning errors before your -webserver fails under load. - -### Blocking the main thread - -Currently PyTorch is not async aware, and computation will block the main -thread while running. That means it would be better if PyTorch was forced to run -on its own thread/process. This wasn't done here because the code is a lot more -complex (mostly because threads and async and queues don't play nice together). -But ultimately it does the same thing. - -This would be important if the inference of single items were long (> 1s) because -in this case, it means every query during inference would have to wait for 1s before -even receiving an error. - -### Dynamic batching - -In general, batching is not necessarily an improvement over passing 1 item at -a time (see [batching details](./main_classes/pipelines#pipeline-batching) for more information). But it can be very effective -when used in the correct setting. In the API, there is no dynamic -batching by default (too much opportunity for a slowdown). But for BLOOM inference - -which is a very large model - dynamic batching is **essential** to provide a decent experience for everyone. diff --git a/docs/source/en/pr_checks.md b/docs/source/en/pr_checks.md new file mode 100644 index 000000000000..f50cede3264f --- /dev/null +++ b/docs/source/en/pr_checks.md @@ -0,0 +1,200 @@ + + +# Checks on a Pull Request + +When you open a pull request on 🤗 Transformers, a fair number of checks will be run to make sure the patch you are adding is not breaking anything existing. Those checks are of four types: +- regular tests +- documentation build +- code and documentation style +- general repository consistency + +In this document, we will take a stab at explaining what those various checks are and the reason behind them, as well as how to debug them locally if one of them fails on your PR. + +Note that, ideally, they require you to have a dev install: + +```bash +pip install transformers[dev] +``` + +or for an editable install: + +```bash +pip install -e .[dev] +``` + +inside the Transformers repo. Since the number of optional dependencies of Transformers has grown a lot, it's possible you don't manage to get all of them. If the dev install fails, make sure to install the Deep Learning framework you are working with (PyTorch, TensorFlow and/or Flax) then do + +```bash +pip install transformers[quality] +``` + +or for an editable install: + +```bash +pip install -e .[quality] +``` + + +## Tests + +All the jobs that begin with `ci/circleci: run_tests_` run parts of the Transformers testing suite. Each of those jobs focuses on a part of the library in a certain environment: for instance `ci/circleci: run_tests_pipelines_tf` runs the pipelines test in an environment where TensorFlow only is installed. 
+ +Note that to avoid running tests when there is no real change in the modules they are testing, only part of the test suite is run each time: a utility is run to determine the differences in the library between before and after the PR (what GitHub shows you in the "Files changes" tab) and picks the tests impacted by that diff. That utility can be run locally with: + +```bash +python utils/tests_fetcher.py +``` + +from the root of the Transformers repo. It will: + +1. Check for each file in the diff if the changes are in the code or only in comments or docstrings. Only the files with real code changes are kept. +2. Build an internal map that gives for each file of the source code of the library all the files it recursively impacts. Module A is said to impact module B if module B imports module A. For the recursive impact, we need a chain of modules going from module A to module B in which each module imports the previous one. +3. Apply this map on the files gathered in step 1, which gives us the list of model files impacted by the PR. +4. Map each of those files to their corresponding test file(s) and get the list of tests to run. + +When executing the script locally, you should get the results of step 1, 3 and 4 printed and thus know which tests are run. The script will also create a file named `test_list.txt` which contains the list of tests to run, and you can run them locally with the following command: + +```bash +python -m pytest -n 8 --dist=loadfile -rA -s $(cat test_list.txt) +``` + +Just in case anything slipped through the cracks, the full test suite is also run daily. + +## Documentation build + +The `build_pr_documentation` job builds and generates a preview of the documentation to make sure everything looks okay once your PR is merged. A bot will add a link to preview the documentation in your PR. Any changes you make to the PR are automatically updated in the preview. If the documentation fails to build, click on **Details** next to the failed job to see where things went wrong. Often, the error is as simple as a missing file in the `toctree`. + +If you're interested in building or previewing the documentation locally, take a look at the [`README.md`](https://github.com/huggingface/transformers/tree/main/docs) in the docs folder. + +## Code and documentation style + +Code formatting is applied to all the source files, the examples and the tests using `black` and `ruff`. We also have a custom tool taking care of the formatting of docstrings and `rst` files (`utils/style_doc.py`), as well as the order of the lazy imports performed in the Transformers `__init__.py` files (`utils/custom_init_isort.py`). All of this can be launched by executing + +```bash +make style +``` + +The CI checks those have been applied inside the `ci/circleci: check_code_quality` check. It also runs `ruff`, that will have a basic look at your code and will complain if it finds an undefined variable, or one that is not used. To run that check locally, use + +```bash +make quality +``` + +This can take a lot of time, so to run the same thing on only the files you modified in the current branch, run + +```bash +make fixup +``` + +This last command will also run all the additional checks for the repository consistency. Let's have a look at them. + +## Repository consistency + +This regroups all the tests to make sure your PR leaves the repository in a good state, and is performed by the `ci/circleci: check_repository_consistency` check. 
You can locally run that check by executing the following: + +```bash +make repo-consistency +``` + +This checks that: + +- All objects added to the init are documented (performed by `utils/check_repo.py`) +- All `__init__.py` files have the same content in their two sections (performed by `utils/check_inits.py`) +- All code identified as a copy from another module is consistent with the original (performed by `utils/check_copies.py`) +- All configuration classes have at least one valid checkpoint mentioned in their docstrings (performed by `utils/check_config_docstrings.py`) +- All configuration classes only contain attributes that are used in corresponding modeling files (performed by `utils/check_config_attributes.py`) +- The translations of the READMEs and the index of the doc have the same model list as the main README (performed by `utils/check_copies.py`) +- The auto-generated tables in the documentation are up to date (performed by `utils/check_table.py`) +- The library has all objects available even if not all optional dependencies are installed (performed by `utils/check_dummies.py`) +- All docstrings properly document the arguments in the signature of the object (performed by `utils/check_docstrings.py`) + +Should this check fail, the first two items require manual fixing, the last four can be fixed automatically for you by running the command + +```bash +make fix-copies +``` + +Additional checks concern PRs that add new models, mainly that: + +- All models added are in an Auto-mapping (performed by `utils/check_repo.py`) + +- All models are properly tested (performed by `utils/check_repo.py`) + + + +### Check copies + +Since the Transformers library is very opinionated with respect to model code, and each model should fully be implemented in a single file without relying on other models, we have added a mechanism that checks whether a copy of the code of a layer of a given model stays consistent with the original. This way, when there is a bug fix, we can see all other impacted models and choose to trickle down the modification or break the copy. + + + +If a file is a full copy of another file, you should register it in the constant `FULL_COPIES` of `utils/check_copies.py`. + + + +This mechanism relies on comments of the form `# Copied from xxx`. The `xxx` should contain the whole path to the class of function which is being copied below. For instance, `RobertaSelfOutput` is a direct copy of the `BertSelfOutput` class, so you can see [here](https://github.com/huggingface/transformers/blob/2bd7a27a671fd1d98059124024f580f8f5c0f3b5/src/transformers/models/roberta/modeling_roberta.py#L289) it has a comment: + +```py +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput +``` + +Note that instead of applying this to a whole class, you can apply it to the relevant methods that are copied from. For instance [here](https://github.com/huggingface/transformers/blob/2bd7a27a671fd1d98059124024f580f8f5c0f3b5/src/transformers/models/roberta/modeling_roberta.py#L598) you can see how `RobertaPreTrainedModel._init_weights` is copied from the same method in `BertPreTrainedModel` with the comment: + +```py +# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights +``` + +Sometimes the copy is exactly the same except for names: for instance in `RobertaAttention`, we use `RobertaSelfAttention` insted of `BertSelfAttention` but other than that, the code is exactly the same. 
This is why `# Copied from` supports simple string replacements with the follwoing syntax: `Copied from xxx with foo->bar`. This means the code is copied with all instances of `foo` being replaced by `bar`. You can see how it used [here](https://github.com/huggingface/transformers/blob/2bd7a27a671fd1d98059124024f580f8f5c0f3b5/src/transformers/models/roberta/modeling_roberta.py#L304C1-L304C86) in `RobertaAttention` with the comment: + +```py +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Roberta +``` + +Note that there shouldn't be any spaces around the arrow (unless that space is part of the pattern to replace of course). + +You can add several patterns separated by a comma. For instance here `CamemberForMaskedLM` is a direct copy of `RobertaForMaskedLM` with two replacements: `Roberta` to `Camembert` and `ROBERTA` to `CAMEMBERT`. You can see [here](https://github.com/huggingface/transformers/blob/15082a9dc6950ecae63a0d3e5060b2fc7f15050a/src/transformers/models/camembert/modeling_camembert.py#L929) this is done with the comment: + +```py +# Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->Camembert, ROBERTA->CAMEMBERT +``` + +If the order matters (because one of the replacements might conflict with a previous one), the replacements are executed from left to right. + + + +If the replacements change the formatting (if you replace a short name by a very long name for instance), the copy is checked after applying the auto-formatter. + + + +Another way when the patterns are just different casings of the same replacement (with an uppercased and a lowercased variants) is just to add the option `all-casing`. [Here](https://github.com/huggingface/transformers/blob/15082a9dc6950ecae63a0d3e5060b2fc7f15050a/src/transformers/models/mobilebert/modeling_mobilebert.py#L1237) is an example in `MobileBertForSequenceClassification` with the comment: + +```py +# Copied from transformers.models.bert.modeling_bert.BertForSequenceClassification with Bert->MobileBert all-casing +``` + +In this case, the code is copied from `BertForSequenceClassification` by replacing: +- `Bert` by `MobileBert` (for instance when using `MobileBertModel` in the init) +- `bert` by `mobilebert` (for instance when defining `self.mobilebert`) +- `BERT` by `MOBILEBERT` (in the constant `MOBILEBERT_INPUTS_DOCSTRING`) diff --git a/docs/source/en/pr_checks.mdx b/docs/source/en/pr_checks.mdx deleted file mode 100644 index 8b562b62b29c..000000000000 --- a/docs/source/en/pr_checks.mdx +++ /dev/null @@ -1,128 +0,0 @@ - - -# Checks on a Pull Request - -When you open a pull request on 🤗 Transformers, a fair number of checks will be run to make sure the patch you are adding is not breaking anything existing. Those checks are of four types: -- regular tests -- documentation build -- code and documentation style -- general repository consistency - -In this document, we will take a stab at explaining what those various checks are and the reason behind them, as well as how to debug them locally if one of them fails on your PR. - -Note that they all require you to have a dev install: - -```bash -pip install transformers[dev] -``` - -or for an editable install: - -```bash -pip install -e .[dev] -``` - -inside the Transformers repo. - -## Tests - -All the jobs that begin with `ci/circleci: run_tests_` run parts of the Transformers testing suite. 
Each of those jobs focuses on a part of the library in a certain environment: for instance `ci/circleci: run_tests_pipelines_tf` runs the pipelines test in an environment where TensorFlow only is installed. - -Note that to avoid running tests when there is no real change in the modules they are testing, only part of the test suite is run each time: a utility is run to determine the differences in the library between before and after the PR (what GitHub shows you in the "Files changes" tab) and picks the tests impacted by that diff. That utility can be run locally with: - -```bash -python utils/tests_fetcher.py -``` - -from the root of the Transformers repo. It will: - -1. Check for each file in the diff if the changes are in the code or only in comments or docstrings. Only the files with real code changes are kept. -2. Build an internal map that gives for each file of the source code of the library all the files it recursively impacts. Module A is said to impact module B if module B imports module A. For the recursive impact, we need a chain of modules going from module A to module B in which each module imports the previous one. -3. Apply this map on the files gathered in step 1, which gives us the list of model files impacted by the PR. -4. Map each of those files to their corresponding test file(s) and get the list of tests to run. - -When executing the script locally, you should get the results of step 1, 3 and 4 printed and thus know which tests are run. The script will also create a file named `test_list.txt` which contains the list of tests to run, and you can run them locally with the following command: - -```bash -python -m pytest -n 8 --dist=loadfile -rA -s $(cat test_list.txt) -``` - -Just in case anything slipped through the cracks, the full test suite is also run daily. - -## Documentation build - -The `build_pr_documentation` job builds and generates a preview of the documentation to make sure everything looks okay once your PR is merged. A bot will add a link to preview the documentation in your PR. Any changes you make to the PR are automatically updated in the preview. If the documentation fails to build, click on **Details** next to the failed job to see where things went wrong. Often, the error is as simple as a missing file in the `toctree`. - -If you're interested in building or previewing the documentation locally, take a look at the [`README.md`](https://github.com/huggingface/transformers/tree/main/docs) in the docs folder. - -## Code and documentation style - -Code formatting is applied to all the source files, the examples and the tests using `black` and `isort`. We also have a custom tool taking care of the formatting of docstrings and `rst` files (`utils/style_doc.py`), as well as the order of the lazy imports performed in the Transformers `__init__.py` files (`utils/custom_init_isort.py`). All of this can be launched by executing - -```bash -make style -``` - -The CI checks those have been applied inside the `ci/circleci: check_code_quality` check. It also runs `flake8`, that will have a basic look at your code and will complain if it finds an undefined variable, or one that is not used. To run that check locally, use - -```bash -make quality -``` - -This can take a lot of time, so to run the same thing on only the files you modified in the current branch, run - -```bash -make fixup -``` - -This last command will also run all the additional checks for the repository consistency. Let's have a look at them. 
- -## Repository consistency - -This regroups all the tests to make sure your PR leaves the repository in a good state, and is performed by the `ci/circleci: check_repository_consistency` check. You can locally run that check by executing the following: - -```bash -make repo-consistency -``` - -This checks that: - -- All objects added to the init are documented (performed by `utils/check_repo.py`) -- All `__init__.py` files have the same content in their two sections (performed by `utils/check_inits.py`) -- All code identified as a copy from another module is consistent with the original (performed by `utils/check_copies.py`) -- All configuration classes have at least one valid checkpoint mentioned in their docstrings (performed by `utils/check_config_docstrings.py`) -- The translations of the READMEs and the index of the doc have the same model list as the main README (performed by `utils/check_copies.py`) -- The auto-generated tables in the documentation are up to date (performed by `utils/check_table.py`) -- The library has all objects available even if not all optional dependencies are installed (performed by `utils/check_dummies.py`) - -Should this check fail, the first two items require manual fixing, the last four can be fixed automatically for you by running the command - -```bash -make fix-copies -``` - -Additional checks concern PRs that add new models, mainly that: - -- All models added are in an Auto-mapping (performed by `utils/check_repo.py`) - -- All models are properly tested (performed by `utils/check_repo.py`) - - diff --git a/docs/source/en/preprocessing.md b/docs/source/en/preprocessing.md new file mode 100644 index 000000000000..c90c6c2a2288 --- /dev/null +++ b/docs/source/en/preprocessing.md @@ -0,0 +1,529 @@ + + +# Preprocess + +[[open-in-colab]] + +Before you can train a model on a dataset, it needs to be preprocessed into the expected model input format. Whether your data is text, images, or audio, they need to be converted and assembled into batches of tensors. 🤗 Transformers provides a set of preprocessing classes to help prepare your data for the model. In this tutorial, you'll learn that for: + +* Text, use a [Tokenizer](./main_classes/tokenizer) to convert text into a sequence of tokens, create a numerical representation of the tokens, and assemble them into tensors. +* Speech and audio, use a [Feature extractor](./main_classes/feature_extractor) to extract sequential features from audio waveforms and convert them into tensors. +* Image inputs use a [ImageProcessor](./main_classes/image) to convert images into tensors. +* Multimodal inputs, use a [Processor](./main_classes/processors) to combine a tokenizer and a feature extractor or image processor. + + + +`AutoProcessor` **always** works and automatically chooses the correct class for the model you're using, whether you're using a tokenizer, image processor, feature extractor or processor. + + + +Before you begin, install 🤗 Datasets so you can load some datasets to experiment with: + +```bash +pip install datasets +``` + +## Natural Language Processing + + + +The main tool for preprocessing textual data is a [tokenizer](main_classes/tokenizer). A tokenizer splits text into *tokens* according to a set of rules. The tokens are converted into numbers and then tensors, which become the model inputs. Any additional inputs required by the model are added by the tokenizer. + + + +If you plan on using a pretrained model, it's important to use the associated pretrained tokenizer. 
This ensures the text is split the same way as the pretraining corpus, and uses the same corresponding tokens-to-index (usually referred to as the *vocab*) during pretraining. + + + +Get started by loading a pretrained tokenizer with the [`AutoTokenizer.from_pretrained`] method. This downloads the *vocab* a model was pretrained with: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") +``` + +Then pass your text to the tokenizer: + +```py +>>> encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.") +>>> print(encoded_input) +{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +The tokenizer returns a dictionary with three important items: + +* [input_ids](glossary#input-ids) are the indices corresponding to each token in the sentence. +* [attention_mask](glossary#attention-mask) indicates whether a token should be attended to or not. +* [token_type_ids](glossary#token-type-ids) identifies which sequence a token belongs to when there is more than one sequence. + +Return your input by decoding the `input_ids`: + +```py +>>> tokenizer.decode(encoded_input["input_ids"]) +'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]' +``` + +As you can see, the tokenizer added two special tokens - `CLS` and `SEP` (classifier and separator) - to the sentence. Not all models need +special tokens, but if they do, the tokenizer automatically adds them for you. + +If there are several sentences you want to preprocess, pass them as a list to the tokenizer: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_inputs = tokenizer(batch_sentences) +>>> print(encoded_inputs) +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1]]} +``` + +### Pad + +Sentences aren't always the same length which can be an issue because tensors, the model inputs, need to have a uniform shape. Padding is a strategy for ensuring tensors are rectangular by adding a special *padding token* to shorter sentences. + +Set the `padding` parameter to `True` to pad the shorter sequences in the batch to match the longest sequence: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... 
] +>>> encoded_input = tokenizer(batch_sentences, padding=True) +>>> print(encoded_input) +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} +``` + +The first and third sentences are now padded with `0`'s because they are shorter. + +### Truncation + +On the other end of the spectrum, sometimes a sequence may be too long for a model to handle. In this case, you'll need to truncate the sequence to a shorter length. + +Set the `truncation` parameter to `True` to truncate a sequence to the maximum length accepted by the model: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True) +>>> print(encoded_input) +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} +``` + + + +Check out the [Padding and truncation](./pad_truncation) concept guide to learn more different padding and truncation arguments. + + + +### Build tensors + +Finally, you want the tokenizer to return the actual tensors that get fed to the model. + +Set the `return_tensors` parameter to either `pt` for PyTorch, or `tf` for TensorFlow: + + + + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt") +>>> print(encoded_input) +{'input_ids': tensor([[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]]), + 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), + 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])} +``` + + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... 
] +>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf") +>>> print(encoded_input) +{'input_ids': , + 'token_type_ids': , + 'attention_mask': } +``` + + + +## Audio + +For audio tasks, you'll need a [feature extractor](main_classes/feature_extractor) to prepare your dataset for the model. The feature extractor is designed to extract features from raw audio data, and convert them into tensors. + +Load the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html) for more details on how to load a dataset) to see how you can use a feature extractor with audio datasets: + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") +``` + +Access the first element of the `audio` column to take a look at the input. Calling the `audio` column automatically loads and resamples the audio file: + +```py +>>> dataset[0]["audio"] +{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, + 0. , 0. ], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'sampling_rate': 8000} +``` + +This returns three items: + +* `array` is the speech signal loaded - and potentially resampled - as a 1D array. +* `path` points to the location of the audio file. +* `sampling_rate` refers to how many data points in the speech signal are measured per second. + +For this tutorial, you'll use the [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) model. Take a look at the model card, and you'll learn Wav2Vec2 is pretrained on 16kHz sampled speech audio. It is important your audio data's sampling rate matches the sampling rate of the dataset used to pretrain the model. If your data's sampling rate isn't the same, then you need to resample your data. + +1. Use 🤗 Datasets' [`~datasets.Dataset.cast_column`] method to upsample the sampling rate to 16kHz: + +```py +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000)) +``` + +2. Call the `audio` column again to resample the audio file: + +```py +>>> dataset[0]["audio"] +{'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ..., + 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'sampling_rate': 16000} +``` + +Next, load a feature extractor to normalize and pad the input. When padding textual data, a `0` is added for shorter sequences. The same idea applies to audio data. The feature extractor adds a `0` - interpreted as silence - to `array`. + +Load the feature extractor with [`AutoFeatureExtractor.from_pretrained`]: + +```py +>>> from transformers import AutoFeatureExtractor + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") +``` + +Pass the audio `array` to the feature extractor. We also recommend adding the `sampling_rate` argument in the feature extractor in order to better debug any silent errors that may occur. 
+ +```py +>>> audio_input = [dataset[0]["audio"]["array"]] +>>> feature_extractor(audio_input, sampling_rate=16000) +{'input_values': [array([ 3.8106556e-04, 2.7506407e-03, 2.8015103e-03, ..., + 5.6335266e-04, 4.6588284e-06, -1.7142107e-04], dtype=float32)]} +``` + +Just like the tokenizer, you can apply padding or truncation to handle variable sequences in a batch. Take a look at the sequence length of these two audio samples: + +```py +>>> dataset[0]["audio"]["array"].shape +(173398,) + +>>> dataset[1]["audio"]["array"].shape +(106496,) +``` + +Create a function to preprocess the dataset so the audio samples are the same lengths. Specify a maximum sample length, and the feature extractor will either pad or truncate the sequences to match it: + +```py +>>> def preprocess_function(examples): +... audio_arrays = [x["array"] for x in examples["audio"]] +... inputs = feature_extractor( +... audio_arrays, +... sampling_rate=16000, +... padding=True, +... max_length=100000, +... truncation=True, +... ) +... return inputs +``` + +Apply the `preprocess_function` to the the first few examples in the dataset: + +```py +>>> processed_dataset = preprocess_function(dataset[:5]) +``` + +The sample lengths are now the same and match the specified maximum length. You can pass your processed dataset to the model now! + +```py +>>> processed_dataset["input_values"][0].shape +(100000,) + +>>> processed_dataset["input_values"][1].shape +(100000,) +``` + +## Computer vision + +For computer vision tasks, you'll need an [image processor](main_classes/image_processor) to prepare your dataset for the model. +Image preprocessing consists of several steps that convert images into the input expected by the model. These steps +include but are not limited to resizing, normalizing, color channel correction, and converting images to tensors. + + + +Image preprocessing often follows some form of image augmentation. Both image preprocessing and image augmentation +transform image data, but they serve different purposes: + +* Image augmentation alters images in a way that can help prevent overfitting and increase the robustness of the model. You can get creative in how you augment your data - adjust brightness and colors, crop, rotate, resize, zoom, etc. However, be mindful not to change the meaning of the images with your augmentations. +* Image preprocessing guarantees that the images match the model’s expected input format. When fine-tuning a computer vision model, images must be preprocessed exactly as when the model was initially trained. + +You can use any library you like for image augmentation. For image preprocessing, use the `ImageProcessor` associated with the model. + + + +Load the [food101](https://huggingface.co/datasets/food101) dataset (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html) for more details on how to load a dataset) to see how you can use an image processor with computer vision datasets: + + + +Use 🤗 Datasets `split` parameter to only load a small sample from the training split since the dataset is quite large! + + + +```py +>>> from datasets import load_dataset + +>>> dataset = load_dataset("food101", split="train[:100]") +``` + +Next, take a look at the image with 🤗 Datasets [`Image`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image) feature: + +```py +>>> dataset[0]["image"] +``` + +
+ +
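+
+Before preprocessing, it can also help to peek at the raw image itself. This is an optional check (it only assumes the `dataset` loaded above): not every image in a dataset is guaranteed to be in RGB mode, which is why the transform function later in this section calls `convert("RGB")`.
+
+```py
+>>> image = dataset[0]["image"]
+>>> image.size, image.mode  # PIL reports (width, height) and a color mode such as "RGB"
+```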
+ +Load the image processor with [`AutoImageProcessor.from_pretrained`]: + +```py +>>> from transformers import AutoImageProcessor + +>>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") +``` + +First, let's add some image augmentation. You can use any library you prefer, but in this tutorial, we'll use torchvision's [`transforms`](https://pytorch.org/vision/stable/transforms.html) module. If you're interested in using another data augmentation library, learn how in the [Albumentations](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb) or [Kornia notebooks](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb). + +1. Here we use [`Compose`](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html) to chain together a couple of +transforms - [`RandomResizedCrop`](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html) and [`ColorJitter`](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html). +Note that for resizing, we can get the image size requirements from the `image_processor`. For some models, an exact height and +width are expected, for others only the `shortest_edge` is defined. + +```py +>>> from torchvision.transforms import RandomResizedCrop, ColorJitter, Compose + +>>> size = ( +... image_processor.size["shortest_edge"] +... if "shortest_edge" in image_processor.size +... else (image_processor.size["height"], image_processor.size["width"]) +... ) + +>>> _transforms = Compose([RandomResizedCrop(size), ColorJitter(brightness=0.5, hue=0.5)]) +``` + +2. The model accepts [`pixel_values`](model_doc/visionencoderdecoder#transformers.VisionEncoderDecoderModel.forward.pixel_values) +as its input. `ImageProcessor` can take care of normalizing the images, and generating appropriate tensors. +Create a function that combines image augmentation and image preprocessing for a batch of images and generates `pixel_values`: + +```py +>>> def transforms(examples): +... images = [_transforms(img.convert("RGB")) for img in examples["image"]] +... examples["pixel_values"] = image_processor(images, do_resize=False, return_tensors="pt")["pixel_values"] +... return examples +``` + + + +In the example above we set `do_resize=False` because we have already resized the images in the image augmentation transformation, +and leveraged the `size` attribute from the appropriate `image_processor`. If you do not resize images during image augmentation, +leave this parameter out. By default, `ImageProcessor` will handle the resizing. + +If you wish to normalize images as a part of the augmentation transformation, use the `image_processor.image_mean`, +and `image_processor.image_std` values. + + +3. Then use 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform) to apply the transforms on the fly: + +```py +>>> dataset.set_transform(transforms) +``` + +4. Now when you access the image, you'll notice the image processor has added `pixel_values`. You can pass your processed dataset to the model now! + +```py +>>> dataset[0].keys() +``` + +Here is what the image looks like after the transforms are applied. The image has been randomly cropped and it's color properties are different. 
+ +```py +>>> import numpy as np +>>> import matplotlib.pyplot as plt + +>>> img = dataset[0]["pixel_values"] +>>> plt.imshow(img.permute(1, 2, 0)) +``` + +
+ +
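+
+As a quick sanity check (a minimal sketch reusing the `dataset` and `image_processor` from above), you can also confirm that the processed tensor is channels-first and matches the size this checkpoint expects:
+
+```py
+>>> pixel_values = dataset[0]["pixel_values"]
+>>> pixel_values.shape  # channels-first, e.g. torch.Size([3, 224, 224]) for this checkpoint
+```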
+ + + +For tasks like object detection, semantic segmentation, instance segmentation, and panoptic segmentation, `ImageProcessor` +offers post processing methods. These methods convert model's raw outputs into meaningful predictions such as bounding boxes, +or segmentation maps. + + + +### Pad + +In some cases, for instance, when fine-tuning [DETR](./model_doc/detr), the model applies scale augmentation at training +time. This may cause images to be different sizes in a batch. You can use [`DetrImageProcessor.pad`] +from [`DetrImageProcessor`] and define a custom `collate_fn` to batch images together. + +```py +>>> def collate_fn(batch): +... pixel_values = [item["pixel_values"] for item in batch] +... encoding = image_processor.pad(pixel_values, return_tensors="pt") +... labels = [item["labels"] for item in batch] +... batch = {} +... batch["pixel_values"] = encoding["pixel_values"] +... batch["pixel_mask"] = encoding["pixel_mask"] +... batch["labels"] = labels +... return batch +``` + +## Multimodal + +For tasks involving multimodal inputs, you'll need a [processor](main_classes/processors) to prepare your dataset for the model. A processor couples together two processing objects such as as tokenizer and feature extractor. + +Load the [LJ Speech](https://huggingface.co/datasets/lj_speech) dataset (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html) for more details on how to load a dataset) to see how you can use a processor for automatic speech recognition (ASR): + +```py +>>> from datasets import load_dataset + +>>> lj_speech = load_dataset("lj_speech", split="train") +``` + +For ASR, you're mainly focused on `audio` and `text` so you can remove the other columns: + +```py +>>> lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"]) +``` + +Now take a look at the `audio` and `text` columns: + +```py +>>> lj_speech[0]["audio"] +{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ..., + 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', + 'sampling_rate': 22050} + +>>> lj_speech[0]["text"] +'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition' +``` + +Remember you should always [resample](preprocessing#audio) your audio dataset's sampling rate to match the sampling rate of the dataset used to pretrain a model! + +```py +>>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000)) +``` + +Load a processor with [`AutoProcessor.from_pretrained`]: + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") +``` + +1. Create a function to process the audio data contained in `array` to `input_values`, and tokenize `text` to `labels`. These are the inputs to the model: + +```py +>>> def prepare_dataset(example): +... audio = example["audio"] + +... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000)) + +... return example +``` + +2. Apply the `prepare_dataset` function to a sample: + +```py +>>> prepare_dataset(lj_speech[0]) +``` + +The processor has now added `input_values` and `labels`, and the sampling rate has also been correctly downsampled to 16kHz. You can pass your processed dataset to the model now! 
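+
+The example above preprocesses a single sample. To preprocess every example, you could apply the same function with 🤗 Datasets [`~datasets.Dataset.map`]. This is only a sketch: the column names follow the LJ Speech example above, and dropping the raw `audio` and `text` columns afterwards is optional.
+
+```py
+>>> lj_speech = lj_speech.map(prepare_dataset, remove_columns=["audio", "text"])
+```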
diff --git a/docs/source/en/preprocessing.mdx b/docs/source/en/preprocessing.mdx deleted file mode 100644 index 5283a9b17e00..000000000000 --- a/docs/source/en/preprocessing.mdx +++ /dev/null @@ -1,469 +0,0 @@ - - -# Preprocess - -[[open-in-colab]] - -Before you can train a model on a dataset, it needs to be preprocessed into the expected model input format. Whether your data is text, images, or audio, they need to be converted and assembled into batches of tensors. 🤗 Transformers provides a set of preprocessing classes to help prepare your data for the model. In this tutorial, you'll learn that for: - -* Text, use a [Tokenizer](./main_classes/tokenizer) to convert text into a sequence of tokens, create a numerical representation of the tokens, and assemble them into tensors. -* Image inputs use a [ImageProcessor](./main_classes/image) to convert images into tensors. -* Speech and audio, use a [Feature extractor](./main_classes/feature_extractor) to extract sequential features from audio waveforms and convert them into tensors. -* Multimodal inputs, use a [Processor](./main_classes/processors) to combine a tokenizer and a feature extractor or image processor. - - - -`AutoProcessor` **always** works and automatically chooses the correct class for the model you're using, whether you're using a tokenizer, image processor, feature extractor or processor. - - - -Before you begin, install 🤗 Datasets so you can load some datasets to experiment with: - -```bash -pip install datasets -``` - -## Natural Language Processing - - - -The main tool for preprocessing textual data is a [tokenizer](main_classes/tokenizer). A tokenizer splits text into *tokens* according to a set of rules. The tokens are converted into numbers and then tensors, which become the model inputs. Any additional inputs required by the model are added by the tokenizer. - - - -If you plan on using a pretrained model, it's important to use the associated pretrained tokenizer. This ensures the text is split the same way as the pretraining corpus, and uses the same corresponding tokens-to-index (usually referrred to as the *vocab*) during pretraining. - - - -Get started by loading a pretrained tokenizer with the [`AutoTokenizer.from_pretrained`] method. This downloads the *vocab* a model was pretrained with: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") -``` - -Then pass your text to the tokenizer: - -```py ->>> encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.") ->>> print(encoded_input) -{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -The tokenizer returns a dictionary with three important items: - -* [input_ids](glossary#input-ids) are the indices corresponding to each token in the sentence. -* [attention_mask](glossary#attention-mask) indicates whether a token should be attended to or not. -* [token_type_ids](glossary#token-type-ids) identifies which sequence a token belongs to when there is more than one sequence. - -Return your input by decoding the `input_ids`: - -```py ->>> tokenizer.decode(encoded_input["input_ids"]) -'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. 
[SEP]' -``` - -As you can see, the tokenizer added two special tokens - `CLS` and `SEP` (classifier and separator) - to the sentence. Not all models need -special tokens, but if they do, the tokenizer automatically adds them for you. - -If there are several sentences you want to preprocess, pass them as a list to the tokenizer: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_inputs = tokenizer(batch_sentences) ->>> print(encoded_inputs) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1]]} -``` - -### Pad - -Sentences aren't always the same length which can be an issue because tensors, the model inputs, need to have a uniform shape. Padding is a strategy for ensuring tensors are rectangular by adding a special *padding token* to shorter sentences. - -Set the `padding` parameter to `True` to pad the shorter sequences in the batch to match the longest sequence: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch_sentences, padding=True) ->>> print(encoded_input) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} -``` - -The first and third sentences are now padded with `0`'s because they are shorter. - -### Truncation - -On the other end of the spectrum, sometimes a sequence may be too long for a model to handle. In this case, you'll need to truncate the sequence to a shorter length. - -Set the `truncation` parameter to `True` to truncate a sequence to the maximum length accepted by the model: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... 
] ->>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True) ->>> print(encoded_input) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} -``` - - - -Check out the [Padding and truncation](./pad_truncation) concept guide to learn more different padding and truncation arguments. - - - -### Build tensors - -Finally, you want the tokenizer to return the actual tensors that get fed to the model. - -Set the `return_tensors` parameter to either `pt` for PyTorch, or `tf` for TensorFlow: - - - - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt") ->>> print(encoded_input) -{'input_ids': tensor([[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]]), - 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), - 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])} -``` - - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf") ->>> print(encoded_input) -{'input_ids': , - 'token_type_ids': , - 'attention_mask': } -``` - - - -## Audio - -For audio tasks, you'll need a [feature extractor](main_classes/feature_extractor) to prepare your dataset for the model. The feature extractor is designed to extract features from raw audio data, and convert them into tensors. - -Load the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html) for more details on how to load a dataset) to see how you can use a feature extractor with audio datasets: - -```py ->>> from datasets import load_dataset, Audio - ->>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") -``` - -Access the first element of the `audio` column to take a look at the input. Calling the `audio` column automatically loads and resamples the audio file: - -```py ->>> dataset[0]["audio"] -{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, - 0. , 0. 
], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 8000} -``` - -This returns three items: - -* `array` is the speech signal loaded - and potentially resampled - as a 1D array. -* `path` points to the location of the audio file. -* `sampling_rate` refers to how many data points in the speech signal are measured per second. - -For this tutorial, you'll use the [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) model. Take a look at the model card, and you'll learn Wav2Vec2 is pretrained on 16kHz sampled speech audio. It is important your audio data's sampling rate matches the sampling rate of the dataset used to pretrain the model. If your data's sampling rate isn't the same, then you need to resample your data. - -1. Use 🤗 Datasets' [`~datasets.Dataset.cast_column`] method to upsample the sampling rate to 16kHz: - -```py ->>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000)) -``` - -2. Call the `audio` column again to resample the audio file: - -```py ->>> dataset[0]["audio"] -{'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ..., - 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 16000} -``` - -Next, load a feature extractor to normalize and pad the input. When padding textual data, a `0` is added for shorter sequences. The same idea applies to audio data. The feature extractor adds a `0` - interpreted as silence - to `array`. - -Load the feature extractor with [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") -``` - -Pass the audio `array` to the feature extractor. We also recommend adding the `sampling_rate` argument in the feature extractor in order to better debug any silent errors that may occur. - -```py ->>> audio_input = [dataset[0]["audio"]["array"]] ->>> feature_extractor(audio_input, sampling_rate=16000) -{'input_values': [array([ 3.8106556e-04, 2.7506407e-03, 2.8015103e-03, ..., - 5.6335266e-04, 4.6588284e-06, -1.7142107e-04], dtype=float32)]} -``` - -Just like the tokenizer, you can apply padding or truncation to handle variable sequences in a batch. Take a look at the sequence length of these two audio samples: - -```py ->>> dataset[0]["audio"]["array"].shape -(173398,) - ->>> dataset[1]["audio"]["array"].shape -(106496,) -``` - -Create a function to preprocess the dataset so the audio samples are the same lengths. Specify a maximum sample length, and the feature extractor will either pad or truncate the sequences to match it: - -```py ->>> def preprocess_function(examples): -... audio_arrays = [x["array"] for x in examples["audio"]] -... inputs = feature_extractor( -... audio_arrays, -... sampling_rate=16000, -... padding=True, -... max_length=100000, -... truncation=True, -... ) -... return inputs -``` - -Apply the `preprocess_function` to the the first few examples in the dataset: - -```py ->>> processed_dataset = preprocess_function(dataset[:5]) -``` - -The sample lengths are now the same and match the specified maximum length. You can pass your processed dataset to the model now! 
- -```py ->>> processed_dataset["input_values"][0].shape -(100000,) - ->>> processed_dataset["input_values"][1].shape -(100000,) -``` - -## Computer vision - -For computer vision tasks, you'll need an [image processor](main_classes/image_processor) to prepare your dataset for the model. The image processor is designed to preprocess images, and convert them into tensors. - -Load the [food101](https://huggingface.co/datasets/food101) dataset (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html) for more details on how to load a dataset) to see how you can use an image processor with computer vision datasets: - - - -Use 🤗 Datasets `split` parameter to only load a small sample from the training split since the dataset is quite large! - - - -```py ->>> from datasets import load_dataset - ->>> dataset = load_dataset("food101", split="train[:100]") -``` - -Next, take a look at the image with 🤗 Datasets [`Image`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image) feature: - -```py ->>> dataset[0]["image"] -``` - -
- -
- -Load the image processor with [`AutoImageProcessor.from_pretrained`]: - -```py ->>> from transformers import AutoImageProcessor - ->>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") -``` - -For computer vision tasks, it is common to add some type of data augmentation to the images as a part of preprocessing. You can add augmentations with any library you'd like, but in this tutorial, you'll use torchvision's [`transforms`](https://pytorch.org/vision/stable/transforms.html) module. If you're interested in using another data augmentation library, learn how in the [Albumentations](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb) or [Kornia notebooks](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb). - -1. Normalize the image with the image processor and use [`Compose`](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html) to chain some transforms - [`RandomResizedCrop`](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html) and [`ColorJitter`](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html) - together: - -```py ->>> from torchvision.transforms import Compose, Normalize, RandomResizedCrop, ColorJitter, ToTensor - ->>> normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std) ->>> size = ( -... image_processor.size["shortest_edge"] -... if "shortest_edge" in image_processor.size -... else (image_processor.size["height"], image_processor.size["width"]) -... ) ->>> _transforms = Compose([RandomResizedCrop(size), ColorJitter(brightness=0.5, hue=0.5), ToTensor(), normalize]) -``` - -2. The model accepts [`pixel_values`](model_doc/visionencoderdecoder#transformers.VisionEncoderDecoderModel.forward.pixel_values) as its input, which is generated by the image processor. Create a function that generates `pixel_values` from the transforms: - -```py ->>> def transforms(examples): -... examples["pixel_values"] = [_transforms(image.convert("RGB")) for image in examples["image"]] -... return examples -``` - -3. Then use 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform) to apply the transforms on the fly: - -```py ->>> dataset.set_transform(transforms) -``` - -4. Now when you access the image, you'll notice the image processor has added `pixel_values`. You can pass your processed dataset to the model now! - -```py ->>> dataset[0].keys() -``` - -Here is what the image looks like after the transforms are applied. The image has been randomly cropped and it's color properties are different. - -```py ->>> import numpy as np ->>> import matplotlib.pyplot as plt - ->>> img = dataset[0]["pixel_values"] ->>> plt.imshow(img.permute(1, 2, 0)) -``` - -
- -
- -## Multimodal - -For tasks involving multimodal inputs, you'll need a [processor](main_classes/processors) to prepare your dataset for the model. A processor couples together two processing objects such as as tokenizer and feature extractor. - -Load the [LJ Speech](https://huggingface.co/datasets/lj_speech) dataset (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html) for more details on how to load a dataset) to see how you can use a processor for automatic speech recognition (ASR): - -```py ->>> from datasets import load_dataset - ->>> lj_speech = load_dataset("lj_speech", split="train") -``` - -For ASR, you're mainly focused on `audio` and `text` so you can remove the other columns: - -```py ->>> lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"]) -``` - -Now take a look at the `audio` and `text` columns: - -```py ->>> lj_speech[0]["audio"] -{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ..., - 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', - 'sampling_rate': 22050} - ->>> lj_speech[0]["text"] -'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition' -``` - -Remember you should always [resample](preprocessing#audio) your audio dataset's sampling rate to match the sampling rate of the dataset used to pretrain a model! - -```py ->>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000)) -``` - -Load a processor with [`AutoProcessor.from_pretrained`]: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") -``` - -1. Create a function to process the audio data contained in `array` to `input_values`, and tokenize `text` to `labels`. These are the inputs to the model: - -```py ->>> def prepare_dataset(example): -... audio = example["audio"] - -... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000)) - -... return example -``` - -2. Apply the `prepare_dataset` function to a sample: - -```py ->>> prepare_dataset(lj_speech[0]) -``` - -The processor has now added `input_values` and `labels`, and the sampling rate has also been correctly downsampled to 16kHz. You can pass your processed dataset to the model now! diff --git a/docs/source/en/quicktour.md b/docs/source/en/quicktour.md new file mode 100644 index 000000000000..d49943da17a1 --- /dev/null +++ b/docs/source/en/quicktour.md @@ -0,0 +1,556 @@ + + +# Quick tour + +[[open-in-colab]] + +Get up and running with 🤗 Transformers! Whether you're a developer or an everyday user, this quick tour will help you get started and show you how to use the [`pipeline`] for inference, load a pretrained model and preprocessor with an [AutoClass](./model_doc/auto), and quickly train a model with PyTorch or TensorFlow. If you're a beginner, we recommend checking out our tutorials or [course](https://huggingface.co/course/chapter1/1) next for more in-depth explanations of the concepts introduced here. 
+ +Before you begin, make sure you have all the necessary libraries installed: + +```bash +!pip install transformers datasets +``` + +You'll also need to install your preferred machine learning framework: + + + + +```bash +pip install torch +``` + + + +```bash +pip install tensorflow +``` + + + +## Pipeline + + + +The [`pipeline`] is the easiest and fastest way to use a pretrained model for inference. You can use the [`pipeline`] out-of-the-box for many tasks across different modalities, some of which are shown in the table below: + + + +For a complete list of available tasks, check out the [pipeline API reference](./main_classes/pipelines). + + + +| **Task** | **Description** | **Modality** | **Pipeline identifier** | +|------------------------------|--------------------------------------------------------------------------------------------------------------|-----------------|-----------------------------------------------| +| Text classification | assign a label to a given sequence of text | NLP | pipeline(task=“sentiment-analysis”) | +| Text generation | generate text given a prompt | NLP | pipeline(task=“text-generation”) | +| Summarization | generate a summary of a sequence of text or document | NLP | pipeline(task=“summarization”) | +| Image classification | assign a label to an image | Computer vision | pipeline(task=“image-classification”) | +| Image segmentation | assign a label to each individual pixel of an image (supports semantic, panoptic, and instance segmentation) | Computer vision | pipeline(task=“image-segmentation”) | +| Object detection | predict the bounding boxes and classes of objects in an image | Computer vision | pipeline(task=“object-detection”) | +| Audio classification | assign a label to some audio data | Audio | pipeline(task=“audio-classification”) | +| Automatic speech recognition | transcribe speech into text | Audio | pipeline(task=“automatic-speech-recognition”) | +| Visual question answering | answer a question about the image, given an image and a question | Multimodal | pipeline(task=“vqa”) | +| Document question answering | answer a question about the document, given a document and a question | Multimodal | pipeline(task="document-question-answering") | +| Image captioning | generate a caption for a given image | Multimodal | pipeline(task="image-to-text") | + +Start by creating an instance of [`pipeline`] and specifying a task you want to use it for. In this guide, you'll use the [`pipeline`] for sentiment analysis as an example: + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline("sentiment-analysis") +``` + +The [`pipeline`] downloads and caches a default [pretrained model](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) and tokenizer for sentiment analysis. Now you can use the `classifier` on your target text: + +```py +>>> classifier("We are very happy to show you the 🤗 Transformers library.") +[{'label': 'POSITIVE', 'score': 0.9998}] +``` + +If you have more than one input, pass your inputs as a list to the [`pipeline`] to return a list of dictionaries: + +```py +>>> results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."]) +>>> for result in results: +... print(f"label: {result['label']}, with score: {round(result['score'], 4)}") +label: POSITIVE, with score: 0.9998 +label: NEGATIVE, with score: 0.5309 +``` + +The [`pipeline`] can also iterate over an entire dataset for any task you like. 
For this example, let's choose automatic speech recognition as our task: + +```py +>>> import torch +>>> from transformers import pipeline + +>>> speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h") +``` + +Load an audio dataset (see the 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart#audio) for more details) you'd like to iterate over. For example, load the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset: + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") # doctest: +IGNORE_RESULT +``` + +You need to make sure the sampling rate of the dataset matches the sampling +rate [`facebook/wav2vec2-base-960h`](https://huggingface.co/facebook/wav2vec2-base-960h) was trained on: + +```py +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate)) +``` + +The audio files are automatically loaded and resampled when calling the `"audio"` column. +Extract the raw waveform arrays from the first 4 samples and pass it as a list to the pipeline: + +```py +>>> result = speech_recognizer(dataset[:4]["audio"]) +>>> print([d["text"] for d in result]) +['I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT', "FONDERING HOW I'D SET UP A JOIN TO HELL T WITH MY WIFE AND WHERE THE AP MIGHT BE", "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE APSO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AN I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS", 'HOW DO I FURN A JOINA COUT'] +``` + +For larger datasets where the inputs are big (like in speech or vision), you'll want to pass a generator instead of a list to load all the inputs in memory. Take a look at the [pipeline API reference](./main_classes/pipelines) for more information. + +### Use another model and tokenizer in the pipeline + +The [`pipeline`] can accommodate any model from the [Hub](https://huggingface.co/models), making it easy to adapt the [`pipeline`] for other use-cases. For example, if you'd like a model capable of handling French text, use the tags on the Hub to filter for an appropriate model. 
The top filtered result returns a multilingual [BERT model](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment) finetuned for sentiment analysis that you can use for French text:
+
+```py
+>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
+```
+
+
+
+Use [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and its associated tokenizer (more on an `AutoClass` in the next section):
+
+```py
+>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+>>> model = AutoModelForSequenceClassification.from_pretrained(model_name)
+>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+```
+
+
+Use [`TFAutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and its associated tokenizer (more on a `TFAutoClass` in the next section):
+
+```py
+>>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+
+>>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
+>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+```
+
+
+
+Specify the model and tokenizer in the [`pipeline`], and now you can apply the `classifier` on French text:
+
+```py
+>>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
+>>> classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.")
+[{'label': '5 stars', 'score': 0.7273}]
+```
+
+If you can't find a model for your use-case, you'll need to finetune a pretrained model on your data. Take a look at our [finetuning tutorial](./training) to learn how. Finally, after you've finetuned your pretrained model, please consider [sharing](./model_sharing) the model with the community on the Hub to democratize machine learning for everyone! 🤗
+
+## AutoClass
+
+
+
+Under the hood, the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] classes work together to power the [`pipeline`] you used above. An [AutoClass](./model_doc/auto) is a shortcut that automatically retrieves the architecture of a pretrained model from its name or path. You only need to select the appropriate `AutoClass` for your task and its associated preprocessing class.
+
+Let's return to the example from the previous section and see how you can use the `AutoClass` to replicate the results of the [`pipeline`].
+
+### AutoTokenizer
+
+A tokenizer is responsible for preprocessing text into an array of numbers as inputs to a model. There are multiple rules that govern the tokenization process, including how to split a word and at what level words should be split (learn more about tokenization in the [tokenizer summary](./tokenizer_summary)). The most important thing to remember is that you need to instantiate a tokenizer with the same model name to ensure you're using the same tokenization rules a model was pretrained with.
+ +Load a tokenizer with [`AutoTokenizer`]: + +```py +>>> from transformers import AutoTokenizer + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> tokenizer = AutoTokenizer.from_pretrained(model_name) +``` + +Pass your text to the tokenizer: + +```py +>>> encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.") +>>> print(encoding) +{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 100, 58263, 13299, 119, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +The tokenizer returns a dictionary containing: + +* [input_ids](./glossary#input-ids): numerical representations of your tokens. +* [attention_mask](.glossary#attention-mask): indicates which tokens should be attended to. + +A tokenizer can also accept a list of inputs, and pad and truncate the text to return a batch with uniform length: + + + + +```py +>>> pt_batch = tokenizer( +... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], +... padding=True, +... truncation=True, +... max_length=512, +... return_tensors="pt", +... ) +``` + + + +```py +>>> tf_batch = tokenizer( +... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], +... padding=True, +... truncation=True, +... max_length=512, +... return_tensors="tf", +... ) +``` + + + + + +Check out the [preprocess](./preprocessing) tutorial for more details about tokenization, and how to use an [`AutoImageProcessor`], [`AutoFeatureExtractor`] and [`AutoProcessor`] to preprocess image, audio, and multimodal inputs. + + + +### AutoModel + + + +🤗 Transformers provides a simple and unified way to load pretrained instances. This means you can load an [`AutoModel`] like you would load an [`AutoTokenizer`]. The only difference is selecting the correct [`AutoModel`] for the task. For text (or sequence) classification, you should load [`AutoModelForSequenceClassification`]: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name) +``` + + + +See the [task summary](./task_summary) for tasks supported by an [`AutoModel`] class. + + + +Now pass your preprocessed batch of inputs directly to the model. You just have to unpack the dictionary by adding `**`: + +```py +>>> pt_outputs = pt_model(**pt_batch) +``` + +The model outputs the final activations in the `logits` attribute. Apply the softmax function to the `logits` to retrieve the probabilities: + +```py +>>> from torch import nn + +>>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1) +>>> print(pt_predictions) +tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], + [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=) +``` + + +🤗 Transformers provides a simple and unified way to load pretrained instances. This means you can load an [`TFAutoModel`] like you would load an [`AutoTokenizer`]. The only difference is selecting the correct [`TFAutoModel`] for the task. 
For text (or sequence) classification, you should load [`TFAutoModelForSequenceClassification`]: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name) +``` + + + +See the [task summary](./task_summary) for tasks supported by an [`AutoModel`] class. + + + +Now pass your preprocessed batch of inputs directly to the model. You can pass the tensors as-is: + +```py +>>> tf_outputs = tf_model(tf_batch) +``` + +The model outputs the final activations in the `logits` attribute. Apply the softmax function to the `logits` to retrieve the probabilities: + +```py +>>> import tensorflow as tf + +>>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1) +>>> tf_predictions # doctest: +IGNORE_RESULT +``` + + + + + +All 🤗 Transformers models (PyTorch or TensorFlow) output the tensors *before* the final activation +function (like softmax) because the final activation function is often fused with the loss. Model outputs are special dataclasses so their attributes are autocompleted in an IDE. The model outputs behave like a tuple or a dictionary (you can index with an integer, a slice or a string) in which case, attributes that are None are ignored. + + + +### Save a model + + + +Once your model is fine-tuned, you can save it with its tokenizer using [`PreTrainedModel.save_pretrained`]: + +```py +>>> pt_save_directory = "./pt_save_pretrained" +>>> tokenizer.save_pretrained(pt_save_directory) # doctest: +IGNORE_RESULT +>>> pt_model.save_pretrained(pt_save_directory) +``` + +When you are ready to use the model again, reload it with [`PreTrainedModel.from_pretrained`]: + +```py +>>> pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained") +``` + + +Once your model is fine-tuned, you can save it with its tokenizer using [`TFPreTrainedModel.save_pretrained`]: + +```py +>>> tf_save_directory = "./tf_save_pretrained" +>>> tokenizer.save_pretrained(tf_save_directory) # doctest: +IGNORE_RESULT +>>> tf_model.save_pretrained(tf_save_directory) +``` + +When you are ready to use the model again, reload it with [`TFPreTrainedModel.from_pretrained`]: + +```py +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained") +``` + + + +One particularly cool 🤗 Transformers feature is the ability to save a model and reload it as either a PyTorch or TensorFlow model. The `from_pt` or `from_tf` parameter can convert the model from one framework to the other: + + + + +```py +>>> from transformers import AutoModel + +>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) +``` + + + +```py +>>> from transformers import TFAutoModel + +>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) +``` + + + +## Custom model builds + +You can modify the model's configuration class to change how a model is built. The configuration specifies a model's attributes, such as the number of hidden layers or attention heads. You start from scratch when you initialize a model from a custom configuration class. The model attributes are randomly initialized, and you'll need to train the model before you can use it to get meaningful results. 
+ +Start by importing [`AutoConfig`], and then load the pretrained model you want to modify. Within [`AutoConfig.from_pretrained`], you can specify the attribute you want to change, such as the number of attention heads: + +```py +>>> from transformers import AutoConfig + +>>> my_config = AutoConfig.from_pretrained("distilbert-base-uncased", n_heads=12) +``` + + + +Create a model from your custom configuration with [`AutoModel.from_config`]: + +```py +>>> from transformers import AutoModel + +>>> my_model = AutoModel.from_config(my_config) +``` + + +Create a model from your custom configuration with [`TFAutoModel.from_config`]: + +```py +>>> from transformers import TFAutoModel + +>>> my_model = TFAutoModel.from_config(my_config) +``` + + + +Take a look at the [Create a custom architecture](./create_a_model) guide for more information about building custom configurations. + +## Trainer - a PyTorch optimized training loop + +All models are a standard [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) so you can use them in any typical training loop. While you can write your own training loop, 🤗 Transformers provides a [`Trainer`] class for PyTorch, which contains the basic training loop and adds additional functionality for features like distributed training, mixed precision, and more. + +Depending on your task, you'll typically pass the following parameters to [`Trainer`]: + +1. You'll start with a [`PreTrainedModel`] or a [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module): + + ```py + >>> from transformers import AutoModelForSequenceClassification + + >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") + ``` + +2. [`TrainingArguments`] contains the model hyperparameters you can change like learning rate, batch size, and the number of epochs to train for. The default values are used if you don't specify any training arguments: + + ```py + >>> from transformers import TrainingArguments + + >>> training_args = TrainingArguments( + ... output_dir="path/to/save/folder/", + ... learning_rate=2e-5, + ... per_device_train_batch_size=8, + ... per_device_eval_batch_size=8, + ... num_train_epochs=2, + ... ) + ``` + +3. Load a preprocessing class like a tokenizer, image processor, feature extractor, or processor: + + ```py + >>> from transformers import AutoTokenizer + + >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") + ``` + +4. Load a dataset: + + ```py + >>> from datasets import load_dataset + + >>> dataset = load_dataset("rotten_tomatoes") # doctest: +IGNORE_RESULT + ``` + +5. Create a function to tokenize the dataset: + + ```py + >>> def tokenize_dataset(dataset): + ... return tokenizer(dataset["text"]) + ``` + + Then apply it over the entire dataset with [`~datasets.Dataset.map`]: + + ```py + >>> dataset = dataset.map(tokenize_dataset, batched=True) + ``` + +6. A [`DataCollatorWithPadding`] to create a batch of examples from your dataset: + + ```py + >>> from transformers import DataCollatorWithPadding + + >>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + ``` + +Now gather all these classes in [`Trainer`]: + +```py +>>> from transformers import Trainer + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=dataset["train"], +... eval_dataset=dataset["test"], +... tokenizer=tokenizer, +... data_collator=data_collator, +... 
) # doctest: +SKIP +``` + +When you're ready, call [`~Trainer.train`] to start training: + +```py +>>> trainer.train() # doctest: +SKIP +``` + + + +For tasks - like translation or summarization - that use a sequence-to-sequence model, use the [`Seq2SeqTrainer`] and [`Seq2SeqTrainingArguments`] classes instead. + + + +You can customize the training loop behavior by subclassing the methods inside [`Trainer`]. This allows you to customize features such as the loss function, optimizer, and scheduler. Take a look at the [`Trainer`] reference for which methods can be subclassed. + +The other way to customize the training loop is by using [Callbacks](./main_classes/callbacks). You can use callbacks to integrate with other libraries and inspect the training loop to report on progress or stop the training early. Callbacks do not modify anything in the training loop itself. To customize something like the loss function, you need to subclass the [`Trainer`] instead. + +## Train with TensorFlow + +All models are a standard [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) so they can be trained in TensorFlow with the [Keras](https://keras.io/) API. 🤗 Transformers provides the [`~TFPreTrainedModel.prepare_tf_dataset`] method to easily load your dataset as a `tf.data.Dataset` so you can start training right away with Keras' [`compile`](https://keras.io/api/models/model_training_apis/#compile-method) and [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) methods. + +1. You'll start with a [`TFPreTrainedModel`] or a [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model): + + ```py + >>> from transformers import TFAutoModelForSequenceClassification + + >>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") + ``` + +2. Load a preprocessing class like a tokenizer, image processor, feature extractor, or processor: + + ```py + >>> from transformers import AutoTokenizer + + >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") + ``` + +3. Create a function to tokenize the dataset: + + ```py + >>> def tokenize_dataset(dataset): + ... return tokenizer(dataset["text"]) # doctest: +SKIP + ``` + +4. Apply the tokenizer over the entire dataset with [`~datasets.Dataset.map`] and then pass the dataset and tokenizer to [`~TFPreTrainedModel.prepare_tf_dataset`]. You can also change the batch size and shuffle the dataset here if you'd like: + + ```py + >>> dataset = dataset.map(tokenize_dataset) # doctest: +SKIP + >>> tf_dataset = model.prepare_tf_dataset( + ... dataset["train"], batch_size=16, shuffle=True, tokenizer=tokenizer + ... ) # doctest: +SKIP + ``` + +5. When you're ready, you can call `compile` and `fit` to start training. Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + + ```py + >>> from tensorflow.keras.optimizers import Adam + + >>> model.compile(optimizer=Adam(3e-5)) # No loss argument! + >>> model.fit(tf_dataset) # doctest: +SKIP + ``` + +## What's next? + +Now that you've completed the 🤗 Transformers quick tour, check out our guides and learn how to do more specific things like writing a custom model, fine-tuning a model for a task, and how to train a model with a script. If you're interested in learning more about 🤗 Transformers core concepts, grab a cup of coffee and take a look at our Conceptual Guides! 
diff --git a/docs/source/en/quicktour.mdx b/docs/source/en/quicktour.mdx deleted file mode 100644 index 8b56eac0beba..000000000000 --- a/docs/source/en/quicktour.mdx +++ /dev/null @@ -1,542 +0,0 @@ - - -# Quick tour - -[[open-in-colab]] - -Get up and running with 🤗 Transformers! Whether you're a developer or an everyday user, this quick tour will help you get started and show you how to use the [`pipeline`] for inference, load a pretrained model and preprocessor with an [AutoClass](./model_doc/auto), and quickly train a model with PyTorch or TensorFlow. If you're a beginner, we recommend checking out our tutorials or [course](https://huggingface.co/course/chapter1/1) next for more in-depth explanations of the concepts introduced here. - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -!pip install transformers datasets -``` - -You'll also need to install your preferred machine learning framework: - - - -```bash -pip install torch -``` - - -```bash -pip install tensorflow -``` - - - -## Pipeline - - - -The [`pipeline`] is the easiest way to use a pretrained model for inference. You can use the [`pipeline`] out-of-the-box for many tasks across different modalities. Take a look at the table below for some supported tasks: - -| **Task** | **Description** | **Modality** | **Pipeline identifier** | -|------------------------------|--------------------------------------------------------------------------------------------------------------|-----------------|-----------------------------------------------| -| Text classification | assign a label to a given sequence of text | NLP | pipeline(task="sentiment-analysis") | -| Text generation | generate text that follows a given prompt | NLP | pipeline(task="text-generation") | -| Name entity recognition | assign a label to each token in a sequence (people, organization, location, etc.) | NLP | pipeline(task="ner") | -| Question answering | extract an answer from the text given some context and a question | NLP | pipeline(task="question-answering") | -| Fill-mask | predict the correct masked token in a sequence | NLP | pipeline(task="fill-mask") | -| Summarization | generate a summary of a sequence of text or document | NLP | pipeline(task="summarization") | -| Translation | translate text from one language into another | NLP | pipeline(task="translation") | -| Image classification | assign a label to an image | Computer vision | pipeline(task="image-classification") | -| Image segmentation | assign a label to each individual pixel of an image (supports semantic, panoptic, and instance segmentation) | Computer vision | pipeline(task="image-segmentation") | -| Object detection | predict the bounding boxes and classes of objects in an image | Computer vision | pipeline(task="object-detection") | -| Audio classification | assign a label to an audio file | Audio | pipeline(task="audio-classification") | -| Automatic speech recognition | extract speech from an audio file into text | Audio | pipeline(task="automatic-speech-recognition") | -| Visual question answering | given an image and a question, correctly answer a question about the image | Multimodal | pipeline(task="vqa") | - -Start by creating an instance of [`pipeline`] and specifying a task you want to use it for. You can use the [`pipeline`] for any of the previously mentioned tasks, and for a complete list of supported tasks, take a look at the [pipeline API reference](./main_classes/pipelines). 
In this guide though, you'll use the [`pipeline`] for sentiment analysis as an example: - -```py ->>> from transformers import pipeline - ->>> classifier = pipeline("sentiment-analysis") -``` - -The [`pipeline`] downloads and caches a default [pretrained model](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) and tokenizer for sentiment analysis. Now you can use the `classifier` on your target text: - -```py ->>> classifier("We are very happy to show you the 🤗 Transformers library.") -[{'label': 'POSITIVE', 'score': 0.9998}] -``` - -If you have more than one input, pass your inputs as a list to the [`pipeline`] to return a list of dictionaries: - -```py ->>> results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."]) ->>> for result in results: -... print(f"label: {result['label']}, with score: {round(result['score'], 4)}") -label: POSITIVE, with score: 0.9998 -label: NEGATIVE, with score: 0.5309 -``` - -The [`pipeline`] can also iterate over an entire dataset for any task you like. For this example, let's choose automatic speech recognition as our task: - -```py ->>> import torch ->>> from transformers import pipeline - ->>> speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h") -``` - -Load an audio dataset (see the 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart#audio) for more details) you'd like to iterate over. For example, load the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset: - -```py ->>> from datasets import load_dataset, Audio - ->>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") # doctest: +IGNORE_RESULT -``` - -You need to make sure the sampling rate of the dataset matches the sampling -rate [`facebook/wav2vec2-base-960h`](https://huggingface.co/facebook/wav2vec2-base-960h) was trained on: - -```py ->>> dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate)) -``` - -The audio files are automatically loaded and resampled when calling the `"audio"` column. -Extract the raw waveform arrays from the first 4 samples and pass it as a list to the pipeline: - -```py ->>> result = speech_recognizer(dataset[:4]["audio"]) ->>> print([d["text"] for d in result]) -['I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT', "FODING HOW I'D SET UP A JOIN TO HET WITH MY WIFE AND WHERE THE AP MIGHT BE", "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE AP SO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AND I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS", 'HOW DO I THURN A JOIN A COUNT'] -``` - -For larger datasets where the inputs are big (like in speech or vision), you'll want to pass a generator instead of a list to load all the inputs in memory. Take a look at the [pipeline API reference](./main_classes/pipelines) for more information. - -### Use another model and tokenizer in the pipeline - -The [`pipeline`] can accommodate any model from the [Hub](https://huggingface.co/models), making it easy to adapt the [`pipeline`] for other use-cases. For example, if you'd like a model capable of handling French text, use the tags on the Hub to filter for an appropriate model. 
The top filtered result returns a multilingual [BERT model](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment) finetuned for sentiment analysis you can use for French text: - -```py ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" -``` - - - -Use [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `AutoClass` in the next section): - -```py ->>> from transformers import AutoTokenizer, AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained(model_name) ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) -``` - - -Use [`TFAutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `TFAutoClass` in the next section): - -```py ->>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name) ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) -``` - - - -Specify the model and tokenizer in the [`pipeline`], and now you can apply the `classifier` on French text: - -```py ->>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer) ->>> classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.") -[{'label': '5 stars', 'score': 0.7273}] -``` - -If you can't find a model for your use-case, you'll need to finetune a pretrained model on your data. Take a look at our [finetuning tutorial](./training) to learn how. Finally, after you've finetuned your pretrained model, please consider [sharing](./model_sharing) the model with the community on the Hub to democratize machine learning for everyone! 🤗 - -## AutoClass - - - -Under the hood, the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] classes work together to power the [`pipeline`] you used above. An [AutoClass](./model_doc/auto) is a shortcut that automatically retrieves the architecture of a pretrained model from its name or path. You only need to select the appropriate `AutoClass` for your task and it's associated preprocessing class. - -Let's return to the example from the previous section and see how you can use the `AutoClass` to replicate the results of the [`pipeline`]. - -### AutoTokenizer - -A tokenizer is responsible for preprocessing text into an array of numbers as inputs to a model. There are multiple rules that govern the tokenization process, including how to split a word and at what level words should be split (learn more about tokenization in the [tokenizer summary](./tokenizer_summary)). The most important thing to remember is you need to instantiate a tokenizer with the same model name to ensure you're using the same tokenization rules a model was pretrained with. 
- -Load a tokenizer with [`AutoTokenizer`]: - -```py ->>> from transformers import AutoTokenizer - ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) -``` - -Pass your text to the tokenizer: - -```py ->>> encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.") ->>> print(encoding) -{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 100, 58263, 13299, 119, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -The tokenizer returns a dictionary containing: - -* [input_ids](./glossary#input-ids): numerical representations of your tokens. -* [attention_mask](.glossary#attention-mask): indicates which tokens should be attended to. - -A tokenizer can also accept a list of inputs, and pad and truncate the text to return a batch with uniform length: - - - -```py ->>> pt_batch = tokenizer( -... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], -... padding=True, -... truncation=True, -... max_length=512, -... return_tensors="pt", -... ) -``` - - -```py ->>> tf_batch = tokenizer( -... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], -... padding=True, -... truncation=True, -... max_length=512, -... return_tensors="tf", -... ) -``` - - - - - -Check out the [preprocess](./preprocessing) tutorial for more details about tokenization, and how to use an [`AutoImageProcessor`], [`AutoFeatureExtractor`] and [`AutoProcessor`] to preprocess image, audio, and multimodal inputs. - - - -### AutoModel - - - -🤗 Transformers provides a simple and unified way to load pretrained instances. This means you can load an [`AutoModel`] like you would load an [`AutoTokenizer`]. The only difference is selecting the correct [`AutoModel`] for the task. For text (or sequence) classification, you should load [`AutoModelForSequenceClassification`]: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name) -``` - - - -See the [task summary](./task_summary) for tasks supported by an [`AutoModel`] class. - - - -Now pass your preprocessed batch of inputs directly to the model. You just have to unpack the dictionary by adding `**`: - -```py ->>> pt_outputs = pt_model(**pt_batch) -``` - -The model outputs the final activations in the `logits` attribute. Apply the softmax function to the `logits` to retrieve the probabilities: - -```py ->>> from torch import nn - ->>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1) ->>> print(pt_predictions) -tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], - [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=) -``` - - -🤗 Transformers provides a simple and unified way to load pretrained instances. This means you can load an [`TFAutoModel`] like you would load an [`AutoTokenizer`]. The only difference is selecting the correct [`TFAutoModel`] for the task. 
For text (or sequence) classification, you should load [`TFAutoModelForSequenceClassification`]: - -```py ->>> from transformers import TFAutoModelForSequenceClassification - ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name) -``` - - - -See the [task summary](./task_summary) for tasks supported by an [`AutoModel`] class. - - - -Now pass your preprocessed batch of inputs directly to the model by passing the dictionary keys directly to the tensors: - -```py ->>> tf_outputs = tf_model(tf_batch) -``` - -The model outputs the final activations in the `logits` attribute. Apply the softmax function to the `logits` to retrieve the probabilities: - -```py ->>> import tensorflow as tf - ->>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1) ->>> tf_predictions # doctest: +IGNORE_RESULT -``` - - - - - -All 🤗 Transformers models (PyTorch or TensorFlow) output the tensors *before* the final activation -function (like softmax) because the final activation function is often fused with the loss. Model outputs are special dataclasses so their attributes are autocompleted in an IDE. The model outputs behave like a tuple or a dictionary (you can index with an integer, a slice or a string) in which case, attributes that are None are ignored. - - - -### Save a model - - - -Once your model is fine-tuned, you can save it with its tokenizer using [`PreTrainedModel.save_pretrained`]: - -```py ->>> pt_save_directory = "./pt_save_pretrained" ->>> tokenizer.save_pretrained(pt_save_directory) # doctest: +IGNORE_RESULT ->>> pt_model.save_pretrained(pt_save_directory) -``` - -When you are ready to use the model again, reload it with [`PreTrainedModel.from_pretrained`]: - -```py ->>> pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained") -``` - - -Once your model is fine-tuned, you can save it with its tokenizer using [`TFPreTrainedModel.save_pretrained`]: - -```py ->>> tf_save_directory = "./tf_save_pretrained" ->>> tokenizer.save_pretrained(tf_save_directory) # doctest: +IGNORE_RESULT ->>> tf_model.save_pretrained(tf_save_directory) -``` - -When you are ready to use the model again, reload it with [`TFPreTrainedModel.from_pretrained`]: - -```py ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained") -``` - - - -One particularly cool 🤗 Transformers feature is the ability to save a model and reload it as either a PyTorch or TensorFlow model. The `from_pt` or `from_tf` parameter can convert the model from one framework to the other: - - - -```py ->>> from transformers import AutoModel - ->>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) -``` - - -```py ->>> from transformers import TFAutoModel - ->>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) -``` - - - -## Custom model builds - -You can modify the model's configuration class to change how a model is built. The configuration specifies a model's attributes, such as the number of hidden layers or attention heads. You start from scratch when you initialize a model from a custom configuration class. The model attributes are randomly initialized, and you'll need to train the model before you can use it to get meaningful results. 
- -Start by importing [`AutoConfig`], and then load the pretrained model you want to modify. Within [`AutoConfig.from_pretrained`], you can specify the attribute you want to change, such as the number of attention heads: - -```py ->>> from transformers import AutoConfig - ->>> my_config = AutoConfig.from_pretrained("distilbert-base-uncased", n_heads=12) -``` - - - -Create a model from your custom configuration with [`AutoModel.from_config`]: - -```py ->>> from transformers import AutoModel - ->>> my_model = AutoModel.from_config(my_config) -``` - - -Create a model from your custom configuration with [`TFAutoModel.from_config`]: - -```py ->>> from transformers import TFAutoModel - ->>> my_model = TFAutoModel.from_config(my_config) -``` - - - -Take a look at the [Create a custom architecture](./create_a_model) guide for more information about building custom configurations. - -## Trainer - a PyTorch optimized training loop - -All models are a standard [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) so you can use them in any typical training loop. While you can write your own training loop, 🤗 Transformers provides a [`Trainer`] class for PyTorch, which contains the basic training loop and adds additional functionality for features like distributed training, mixed precision, and more. - -Depending on your task, you'll typically pass the following parameters to [`Trainer`]: - -1. A [`PreTrainedModel`] or a [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module): - - ```py - >>> from transformers import AutoModelForSequenceClassification - - >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") - ``` - -2. [`TrainingArguments`] contains the model hyperparameters you can change like learning rate, batch size, and the number of epochs to train for. The default values are used if you don't specify any training arguments: - - ```py - >>> from transformers import TrainingArguments - - >>> training_args = TrainingArguments( - ... output_dir="path/to/save/folder/", - ... learning_rate=2e-5, - ... per_device_train_batch_size=8, - ... per_device_eval_batch_size=8, - ... num_train_epochs=2, - ... ) - ``` - -3. A preprocessing class like a tokenizer, image processor, feature extractor, or processor: - - ```py - >>> from transformers import AutoTokenizer - - >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") - ``` - -4. Load a dataset: - - ```py - >>> from datasets import load_dataset - - >>> dataset = load_dataset("rotten_tomatoes") # doctest: +IGNORE_RESULT - ``` - -5. Create a function to tokenize the dataset: - - ```py - >>> def tokenize_dataset(dataset): - ... return tokenizer(dataset["text"]) - ``` - - Then apply it over the entire dataset with [`~datasets.Dataset.map`]: - - ```py - >>> dataset = dataset.map(tokenize_dataset, batched=True) - ``` - -6. A [`DataCollatorWithPadding`] to create a batch of examples from your dataset: - - ```py - >>> from transformers import DataCollatorWithPadding - - >>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - ``` - -Now gather all these classes in [`Trainer`]: - -```py ->>> from transformers import Trainer - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=dataset["train"], -... eval_dataset=dataset["test"], -... tokenizer=tokenizer, -... data_collator=data_collator, -... 
) # doctest: +SKIP -``` - -When you're ready, call [`~Trainer.train`] to start training: - -```py ->>> trainer.train() # doctest: +SKIP -``` - - - -For tasks - like translation or summarization - that use a sequence-to-sequence model, use the [`Seq2SeqTrainer`] and [`Seq2SeqTrainingArguments`] classes instead. - - - -You can customize the training loop behavior by subclassing the methods inside [`Trainer`]. This allows you to customize features such as the loss function, optimizer, and scheduler. Take a look at the [`Trainer`] reference for which methods can be subclassed. - -The other way to customize the training loop is by using [Callbacks](./main_classes/callbacks). You can use callbacks to integrate with other libraries and inspect the training loop to report on progress or stop the training early. Callbacks do not modify anything in the training loop itself. To customize something like the loss function, you need to subclass the [`Trainer`] instead. - -## Train with TensorFlow - -All models are a standard [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) so they can be trained in TensorFlow with the [Keras](https://keras.io/) API. 🤗 Transformers provides the [`~TFPreTrainedModel.prepare_tf_dataset`] method to easily load your dataset as a `tf.data.Dataset` so you can start training right away with Keras' [`compile`](https://keras.io/api/models/model_training_apis/#compile-method) and [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) methods. - -1. You'll start with a [`TFPreTrainedModel`] or a [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model): - - ```py - >>> from transformers import TFAutoModelForSequenceClassification - - >>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") - ``` - -2. A preprocessing class like a tokenizer, image processor, feature extractor, or processor: - - ```py - >>> from transformers import AutoTokenizer - - >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") - ``` - -3. Create a function to tokenize the dataset: - - ```py - >>> def tokenize_dataset(dataset): - ... return tokenizer(dataset["text"]) # doctest: +SKIP - ``` - -4. Apply the tokenizer over the entire dataset with [`~datasets.Dataset.map`] and then pass the dataset and tokenizer to [`~TFPreTrainedModel.prepare_tf_dataset`]. You can also change the batch size and shuffle the dataset here if you'd like: - - ```py - >>> dataset = dataset.map(tokenize_dataset) # doctest: +SKIP - >>> tf_dataset = model.prepare_tf_dataset( - ... dataset, batch_size=16, shuffle=True, tokenizer=tokenizer - ... ) # doctest: +SKIP - ``` - -5. When you're ready, you can call `compile` and `fit` to start training: - - ```py - >>> from tensorflow.keras.optimizers import Adam - - >>> model.compile(optimizer=Adam(3e-5)) - >>> model.fit(dataset) # doctest: +SKIP - ``` - -## What's next? - -Now that you've completed the 🤗 Transformers quick tour, check out our guides and learn how to do more specific things like writing a custom model, fine-tuning a model for a task, and how to train a model with a script. If you're interested in learning more about 🤗 Transformers core concepts, grab a cup of coffee and take a look at our Conceptual Guides! 
diff --git a/docs/source/en/run_scripts.md b/docs/source/en/run_scripts.md new file mode 100644 index 000000000000..3b40b6ea0672 --- /dev/null +++ b/docs/source/en/run_scripts.md @@ -0,0 +1,351 @@ + + +# Train with a script + +Along with the 🤗 Transformers [notebooks](./noteboks/README), there are also example scripts demonstrating how to train a model for a task with [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), or [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). + +You will also find scripts we've used in our [research projects](https://github.com/huggingface/transformers/tree/main/examples/research_projects) and [legacy examples](https://github.com/huggingface/transformers/tree/main/examples/legacy) which are mostly community contributed. These scripts are not actively maintained and require a specific version of 🤗 Transformers that will most likely be incompatible with the latest version of the library. + +The example scripts are not expected to work out-of-the-box on every problem, and you may need to adapt the script to the problem you're trying to solve. To help you with this, most of the scripts fully expose how data is preprocessed, allowing you to edit it as necessary for your use case. + +For any feature you'd like to implement in an example script, please discuss it on the [forum](https://discuss.huggingface.co/) or in an [issue](https://github.com/huggingface/transformers/issues) before submitting a Pull Request. While we welcome bug fixes, it is unlikely we will merge a Pull Request that adds more functionality at the cost of readability. + +This guide will show you how to run an example summarization training script in [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) and [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization). All examples are expected to work with both frameworks unless otherwise specified. + +## Setup + +To successfully run the latest version of the example scripts, you have to **install 🤗 Transformers from source** in a new virtual environment: + +```bash +git clone https://github.com/huggingface/transformers +cd transformers +pip install . +``` + +For older versions of the example scripts, click on the toggle below: + +
+ Examples for older versions of 🤗 Transformers + +
+ +Then switch your current clone of 🤗 Transformers to a specific version, like v3.5.1 for example: + +```bash +git checkout tags/v3.5.1 +``` + +After you've setup the correct library version, navigate to the example folder of your choice and install the example specific requirements: + +```bash +pip install -r requirements.txt +``` + +## Run a script + + + +The example script downloads and preprocesses a dataset from the 🤗 [Datasets](https://huggingface.co/docs/datasets/) library. Then the script fine-tunes a dataset with the [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) on an architecture that supports summarization. The following example shows how to fine-tune [T5-small](https://huggingface.co/t5-small) on the [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) dataset. The T5 model requires an additional `source_prefix` argument due to how it was trained. This prompt lets T5 know this is a summarization task. + +```bash +python examples/pytorch/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + + +The example script downloads and preprocesses a dataset from the 🤗 [Datasets](https://huggingface.co/docs/datasets/) library. Then the script fine-tunes a dataset using Keras on an architecture that supports summarization. The following example shows how to fine-tune [T5-small](https://huggingface.co/t5-small) on the [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) dataset. The T5 model requires an additional `source_prefix` argument due to how it was trained. This prompt lets T5 know this is a summarization task. + +```bash +python examples/tensorflow/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 16 \ + --num_train_epochs 3 \ + --do_train \ + --do_eval +``` + + + +## Distributed training and mixed precision + +The [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) supports distributed training and mixed precision, which means you can also use it in a script. To enable both of these features: + +- Add the `fp16` argument to enable mixed precision. +- Set the number of GPUs to use with the `nproc_per_node` argument. + +```bash +python -m torch.distributed.launch \ + --nproc_per_node 8 pytorch/summarization/run_summarization.py \ + --fp16 \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + +TensorFlow scripts utilize a [`MirroredStrategy`](https://www.tensorflow.org/guide/distributed_training#mirroredstrategy) for distributed training, and you don't need to add any additional arguments to the training script. The TensorFlow script will use multiple GPUs by default if they are available. + +## Run a script on a TPU + + + +Tensor Processing Units (TPUs) are specifically designed to accelerate performance. 
PyTorch supports TPUs with the [XLA](https://www.tensorflow.org/xla) deep learning compiler (see [here](https://github.com/pytorch/xla/blob/master/README.md) for more details). To use a TPU, launch the `xla_spawn.py` script and use the `num_cores` argument to set the number of TPU cores you want to use. + +```bash +python xla_spawn.py --num_cores 8 \ + summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + + +Tensor Processing Units (TPUs) are specifically designed to accelerate performance. TensorFlow scripts utilize a [`TPUStrategy`](https://www.tensorflow.org/guide/distributed_training#tpustrategy) for training on TPUs. To use a TPU, pass the name of the TPU resource to the `tpu` argument. + +```bash +python run_summarization.py \ + --tpu name_of_tpu_resource \ + --model_name_or_path t5-small \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 16 \ + --num_train_epochs 3 \ + --do_train \ + --do_eval +``` + + + +## Run a script with 🤗 Accelerate + +🤗 [Accelerate](https://huggingface.co/docs/accelerate) is a PyTorch-only library that offers a unified method for training a model on several types of setups (CPU-only, multiple GPUs, TPUs) while maintaining complete visibility into the PyTorch training loop. Make sure you have 🤗 Accelerate installed if you don't already have it: + +> Note: As Accelerate is rapidly developing, the git version of accelerate must be installed to run the scripts +```bash +pip install git+https://github.com/huggingface/accelerate +``` + +Instead of the `run_summarization.py` script, you need to use the `run_summarization_no_trainer.py` script. 🤗 Accelerate supported scripts will have a `task_no_trainer.py` file in the folder. Begin by running the following command to create and save a configuration file: + +```bash +accelerate config +``` + +Test your setup to make sure it is configured correctly: + +```bash +accelerate test +``` + +Now you are ready to launch the training: + +```bash +accelerate launch run_summarization_no_trainer.py \ + --model_name_or_path t5-small \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir ~/tmp/tst-summarization +``` + +## Use a custom dataset + +The summarization script supports custom datasets as long as they are a CSV or JSON Line file. When you use your own dataset, you need to specify several additional arguments: + +- `train_file` and `validation_file` specify the path to your training and validation files. +- `text_column` is the input text to summarize. +- `summary_column` is the target text to output. 
+ +A summarization script using a custom dataset would look like this: + +```bash +python examples/pytorch/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --train_file path_to_csv_or_jsonlines_file \ + --validation_file path_to_csv_or_jsonlines_file \ + --text_column text_column_name \ + --summary_column summary_column_name \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --overwrite_output_dir \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --predict_with_generate +``` + +## Test a script + +It is often a good idea to run your script on a smaller number of dataset examples to ensure everything works as expected before committing to an entire dataset which may take hours to complete. Use the following arguments to truncate the dataset to a maximum number of samples: + +- `max_train_samples` +- `max_eval_samples` +- `max_predict_samples` + +```bash +python examples/pytorch/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --max_train_samples 50 \ + --max_eval_samples 50 \ + --max_predict_samples 50 \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + +Not all example scripts support the `max_predict_samples` argument. If you aren't sure whether your script supports this argument, add the `-h` argument to check: + +```bash +examples/pytorch/summarization/run_summarization.py -h +``` + +## Resume training from checkpoint + +Another helpful option to enable is resuming training from a previous checkpoint. This will ensure you can pick up where you left off without starting over if your training gets interrupted. There are two methods to resume training from a checkpoint. + +The first method uses the `output_dir previous_output_dir` argument to resume training from the latest checkpoint stored in `output_dir`. In this case, you should remove `overwrite_output_dir`: + +```bash +python examples/pytorch/summarization/run_summarization.py + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --output_dir previous_output_dir \ + --predict_with_generate +``` + +The second method uses the `resume_from_checkpoint path_to_specific_checkpoint` argument to resume training from a specific checkpoint folder. + +```bash +python examples/pytorch/summarization/run_summarization.py + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --resume_from_checkpoint path_to_specific_checkpoint \ + --predict_with_generate +``` + +## Share your model + +All scripts can upload your final model to the [Model Hub](https://huggingface.co/models). Make sure you are logged into Hugging Face before you begin: + +```bash +huggingface-cli login +``` + +Then add the `push_to_hub` argument to the script. 
This argument will create a repository with your Hugging Face username and the folder name specified in `output_dir`. + +To give your repository a specific name, use the `push_to_hub_model_id` argument to add it. The repository will be automatically listed under your namespace. + +The following example shows how to upload a model with a specific repository name: + +```bash +python examples/pytorch/summarization/run_summarization.py + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --push_to_hub \ + --push_to_hub_model_id finetuned-t5-cnn_dailymail \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` \ No newline at end of file diff --git a/docs/source/en/run_scripts.mdx b/docs/source/en/run_scripts.mdx deleted file mode 100644 index 58d6b8dd3e20..000000000000 --- a/docs/source/en/run_scripts.mdx +++ /dev/null @@ -1,347 +0,0 @@ - - -# Train with a script - -Along with the 🤗 Transformers [notebooks](./noteboks/README), there are also example scripts demonstrating how to train a model for a task with [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), or [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). - -You will also find scripts we've used in our [research projects](https://github.com/huggingface/transformers/tree/main/examples/research_projects) and [legacy examples](https://github.com/huggingface/transformers/tree/main/examples/legacy) which are mostly community contributed. These scripts are not actively maintained and require a specific version of 🤗 Transformers that will most likely be incompatible with the latest version of the library. - -The example scripts are not expected to work out-of-the-box on every problem, and you may need to adapt the script to the problem you're trying to solve. To help you with this, most of the scripts fully expose how data is preprocessed, allowing you to edit it as necessary for your use case. - -For any feature you'd like to implement in an example script, please discuss it on the [forum](https://discuss.huggingface.co/) or in an [issue](https://github.com/huggingface/transformers/issues) before submitting a Pull Request. While we welcome bug fixes, it is unlikely we will merge a Pull Request that adds more functionality at the cost of readability. - -This guide will show you how to run an example summarization training script in [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) and [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization). All examples are expected to work with both frameworks unless otherwise specified. - -## Setup - -To successfully run the latest version of the example scripts, you have to **install 🤗 Transformers from source** in a new virtual environment: - -```bash -git clone https://github.com/huggingface/transformers -cd transformers -pip install . -``` - -For older versions of the example scripts, click on the toggle below: - -
- Examples for older versions of 🤗 Transformers - -
- -Then switch your current clone of 🤗 Transformers to a specific version, like v3.5.1 for example: - -```bash -git checkout tags/v3.5.1 -``` - -After you've setup the correct library version, navigate to the example folder of your choice and install the example specific requirements: - -```bash -pip install -r requirements.txt -``` - -## Run a script - - - -The example script downloads and preprocesses a dataset from the 🤗 [Datasets](https://huggingface.co/docs/datasets/) library. Then the script fine-tunes a dataset with the [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) on an architecture that supports summarization. The following example shows how to fine-tune [T5-small](https://huggingface.co/t5-small) on the [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) dataset. The T5 model requires an additional `source_prefix` argument due to how it was trained. This prompt lets T5 know this is a summarization task. - -```bash -python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - - -The example script downloads and preprocesses a dataset from the 🤗 [Datasets](https://huggingface.co/docs/datasets/) library. Then the script fine-tunes a dataset using Keras on an architecture that supports summarization. The following example shows how to fine-tune [T5-small](https://huggingface.co/t5-small) on the [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) dataset. The T5 model requires an additional `source_prefix` argument due to how it was trained. This prompt lets T5 know this is a summarization task. - -```bash -python examples/tensorflow/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 16 \ - --num_train_epochs 3 \ - --do_train \ - --do_eval -``` - - - -## Distributed training and mixed precision - -The [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) supports distributed training and mixed precision, which means you can also use it in a script. To enable both of these features: - -- Add the `fp16` argument to enable mixed precision. -- Set the number of GPUs to use with the `nproc_per_node` argument. - -```bash -python -m torch.distributed.launch \ - --nproc_per_node 8 pytorch/summarization/run_summarization.py \ - --fp16 \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - -TensorFlow scripts utilize a [`MirroredStrategy`](https://www.tensorflow.org/guide/distributed_training#mirroredstrategy) for distributed training, and you don't need to add any additional arguments to the training script. The TensorFlow script will use multiple GPUs by default if they are available. - -## Run a script on a TPU - - - -Tensor Processing Units (TPUs) are specifically designed to accelerate performance. 
PyTorch supports TPUs with the [XLA](https://www.tensorflow.org/xla) deep learning compiler (see [here](https://github.com/pytorch/xla/blob/master/README.md) for more details). To use a TPU, launch the `xla_spawn.py` script and use the `num_cores` argument to set the number of TPU cores you want to use. - -```bash -python xla_spawn.py --num_cores 8 \ - summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - - -Tensor Processing Units (TPUs) are specifically designed to accelerate performance. TensorFlow scripts utilize a [`TPUStrategy`](https://www.tensorflow.org/guide/distributed_training#tpustrategy) for training on TPUs. To use a TPU, pass the name of the TPU resource to the `tpu` argument. - -```bash -python run_summarization.py \ - --tpu name_of_tpu_resource \ - --model_name_or_path t5-small \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 16 \ - --num_train_epochs 3 \ - --do_train \ - --do_eval -``` - - - -## Run a script with 🤗 Accelerate - -🤗 [Accelerate](https://huggingface.co/docs/accelerate) is a PyTorch-only library that offers a unified method for training a model on several types of setups (CPU-only, multiple GPUs, TPUs) while maintaining complete visibility into the PyTorch training loop. Make sure you have 🤗 Accelerate installed if you don't already have it: - -> Note: As Accelerate is rapidly developing, the git version of accelerate must be installed to run the scripts -```bash -pip install git+https://github.com/huggingface/accelerate -``` - -Instead of the `run_summarization.py` script, you need to use the `run_summarization_no_trainer.py` script. 🤗 Accelerate supported scripts will have a `task_no_trainer.py` file in the folder. Begin by running the following command to create and save a configuration file: - -```bash -accelerate config -``` - -Test your setup to make sure it is configured correctly: - -```bash -accelerate test -``` - -Now you are ready to launch the training: - -```bash -accelerate launch run_summarization_no_trainer.py \ - --model_name_or_path t5-small \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir ~/tmp/tst-summarization -``` - -## Use a custom dataset - -The summarization script supports custom datasets as long as they are a CSV or JSON Line file. When you use your own dataset, you need to specify several additional arguments: - -- `train_file` and `validation_file` specify the path to your training and validation files. -- `text_column` is the input text to summarize. -- `summary_column` is the target text to output. 
- -A summarization script using a custom dataset would look like this: - -```bash -python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --train_file path_to_csv_or_jsonlines_file \ - --validation_file path_to_csv_or_jsonlines_file \ - --text_column text_column_name \ - --summary_column summary_column_name \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --overwrite_output_dir \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --predict_with_generate -``` - -## Test a script - -It is often a good idea to run your script on a smaller number of dataset examples to ensure everything works as expected before committing to an entire dataset which may take hours to complete. Use the following arguments to truncate the dataset to a maximum number of samples: - -- `max_train_samples` -- `max_eval_samples` -- `max_predict_samples` - -```bash -python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --max_train_samples 50 \ - --max_eval_samples 50 \ - --max_predict_samples 50 \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - -Not all example scripts support the `max_predict_samples` argument. If you aren't sure whether your script supports this argument, add the `-h` argument to check: - -```bash -examples/pytorch/summarization/run_summarization.py -h -``` - -## Resume training from checkpoint - -Another helpful option to enable is resuming training from a previous checkpoint. This will ensure you can pick up where you left off without starting over if your training gets interrupted. There are two methods to resume training from a checkpoint. - -The first method uses the `output_dir previous_output_dir` argument to resume training from the latest checkpoint stored in `output_dir`. In this case, you should remove `overwrite_output_dir`: - -```bash -python examples/pytorch/summarization/run_summarization.py - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --output_dir previous_output_dir \ - --predict_with_generate -``` - -The second method uses the `resume_from_checkpoint path_to_specific_checkpoint` argument to resume training from a specific checkpoint folder. - -```bash -python examples/pytorch/summarization/run_summarization.py - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --resume_from_checkpoint path_to_specific_checkpoint \ - --predict_with_generate -``` - -## Share your model - -All scripts can upload your final model to the [Model Hub](https://huggingface.co/models). Make sure you are logged into Hugging Face before you begin: - -```bash -huggingface-cli login -``` - -Then add the `push_to_hub` argument to the script. 
This argument will create a repository with your Hugging Face username and the folder name specified in `output_dir`. - -To give your repository a specific name, use the `push_to_hub_model_id` argument to add it. The repository will be automatically listed under your namespace. - -The following example shows how to upload a model with a specific repository name: - -```bash -python examples/pytorch/summarization/run_summarization.py - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --push_to_hub \ - --push_to_hub_model_id finetuned-t5-cnn_dailymail \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` \ No newline at end of file diff --git a/docs/source/en/sagemaker.md b/docs/source/en/sagemaker.md new file mode 100644 index 000000000000..f0a5a5f9c114 --- /dev/null +++ b/docs/source/en/sagemaker.md @@ -0,0 +1,29 @@ + + +# Run training on Amazon SageMaker + +The documentation has been moved to [hf.co/docs/sagemaker](https://huggingface.co/docs/sagemaker). This page will be removed in `transformers` 5.0. + +### Table of Content + +- [Train Hugging Face models on Amazon SageMaker with the SageMaker Python SDK](https://huggingface.co/docs/sagemaker/train) +- [Deploy Hugging Face models to Amazon SageMaker with the SageMaker Python SDK](https://huggingface.co/docs/sagemaker/inference) +- [Frequently Asked Questions](https://huggingface.co/docs/sagemaker/faq) diff --git a/docs/source/en/sagemaker.mdx b/docs/source/en/sagemaker.mdx deleted file mode 100644 index 1ffdd4326e4d..000000000000 --- a/docs/source/en/sagemaker.mdx +++ /dev/null @@ -1,25 +0,0 @@ - - -# Run training on Amazon SageMaker - -The documentation has been moved to [hf.co/docs/sagemaker](https://huggingface.co/docs/sagemaker). This page will be removed in `transformers` 5.0. - -### Table of Content - -- [Train Hugging Face models on Amazon SageMaker with the SageMaker Python SDK](https://huggingface.co/docs/sagemaker/train) -- [Deploy Hugging Face models to Amazon SageMaker with the SageMaker Python SDK](https://huggingface.co/docs/sagemaker/inference) -- [Frequently Asked Questions](https://huggingface.co/docs/sagemaker/faq) diff --git a/docs/source/en/serialization.md b/docs/source/en/serialization.md new file mode 100644 index 000000000000..9fec884a8be4 --- /dev/null +++ b/docs/source/en/serialization.md @@ -0,0 +1,210 @@ + + +# Export to ONNX + +Deploying 🤗 Transformers models in production environments often requires, or can benefit from exporting the models into +a serialized format that can be loaded and executed on specialized runtimes and hardware. + +🤗 Optimum is an extension of Transformers that enables exporting models from PyTorch or TensorFlow to serialized formats +such as ONNX and TFLite through its `exporters` module. 🤗 Optimum also provides a set of performance optimization tools to train +and run models on targeted hardware with maximum efficiency. + +This guide demonstrates how you can export 🤗 Transformers models to ONNX with 🤗 Optimum, for the guide on exporting models to TFLite, +please refer to the [Export to TFLite page](tflite). 
+ +## Export to ONNX + +[ONNX (Open Neural Network eXchange)](http://onnx.ai) is an open standard that defines a common set of operators and a +common file format to represent deep learning models in a wide variety of frameworks, including PyTorch and +TensorFlow. When a model is exported to the ONNX format, these operators are used to +construct a computational graph (often called an _intermediate representation_) which +represents the flow of data through the neural network. + +By exposing a graph with standardized operators and data types, ONNX makes it easy to +switch between frameworks. For example, a model trained in PyTorch can be exported to +ONNX format and then imported in TensorFlow (and vice versa). + +Once exported to ONNX format, a model can be: +- optimized for inference via techniques such as [graph optimization](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization) and [quantization](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/quantization). +- run with ONNX Runtime via [`ORTModelForXXX` classes](https://huggingface.co/docs/optimum/onnxruntime/package_reference/modeling_ort), +which follow the same `AutoModel` API as the one you are used to in 🤗 Transformers. +- run with [optimized inference pipelines](https://huggingface.co/docs/optimum/main/en/onnxruntime/usage_guides/pipelines), +which has the same API as the [`pipeline`] function in 🤗 Transformers. + +🤗 Optimum provides support for the ONNX export by leveraging configuration objects. These configuration objects come +ready-made for a number of model architectures, and are designed to be easily extendable to other architectures. + +For the list of ready-made configurations, please refer to [🤗 Optimum documentation](https://huggingface.co/docs/optimum/exporters/onnx/overview). + +There are two ways to export a 🤗 Transformers model to ONNX, here we show both: + +- export with 🤗 Optimum via CLI. +- export with 🤗 Optimum with `optimum.onnxruntime`. + +### Exporting a 🤗 Transformers model to ONNX with CLI + +To export a 🤗 Transformers model to ONNX, first install an extra dependency: + +```bash +pip install optimum[exporters] +``` + +To check out all available arguments, refer to the [🤗 Optimum docs](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli), +or view help in command line: + +```bash +optimum-cli export onnx --help +``` + +To export a model's checkpoint from the 🤗 Hub, for example, `distilbert-base-uncased-distilled-squad`, run the following command: + +```bash +optimum-cli export onnx --model distilbert-base-uncased-distilled-squad distilbert_base_uncased_squad_onnx/ +``` + +You should see the logs indicating progress and showing where the resulting `model.onnx` is saved, like this: + +```bash +Validating ONNX model distilbert_base_uncased_squad_onnx/model.onnx... + -[✓] ONNX model output names match reference model (start_logits, end_logits) + - Validating ONNX Model output "start_logits": + -[✓] (2, 16) matches (2, 16) + -[✓] all values close (atol: 0.0001) + - Validating ONNX Model output "end_logits": + -[✓] (2, 16) matches (2, 16) + -[✓] all values close (atol: 0.0001) +The ONNX export succeeded and the exported model was saved at: distilbert_base_uncased_squad_onnx +``` + +The example above illustrates exporting a checkpoint from 🤗 Hub. When exporting a local model, first make sure that you +saved both the model's weights and tokenizer files in the same directory (`local_path`). 
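For example, one way to produce such a directory is with `save_pretrained` (a minimal sketch; any checkpoint works, here we reuse `distilbert-base-uncased-distilled-squad` and the placeholder directory name `local_path`):

```python
>>> from transformers import AutoModelForQuestionAnswering, AutoTokenizer

>>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")
>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")

>>> # the model weights and the tokenizer files end up side by side in `local_path`
>>> model.save_pretrained("local_path")
>>> tokenizer.save_pretrained("local_path")  # doctest: +IGNORE_RESULT
```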
When using the CLI, pass the `local_path` to the `model` argument instead of the checkpoint name on the 🤗 Hub and provide the `--task` argument. You can review the list of supported tasks in the [🤗 Optimum documentation](https://huggingface.co/docs/optimum/exporters/task_manager). If the `task` argument is not provided, it will default to the model architecture without any task-specific head.

```bash
optimum-cli export onnx --model local_path --task question-answering distilbert_base_uncased_squad_onnx/
```

The resulting `model.onnx` file can then be run on one of the [many accelerators](https://onnx.ai/supported-tools.html#deployModel) that support the ONNX standard. For example, we can load and run the model with [ONNX Runtime](https://onnxruntime.ai/) as follows:

```python
>>> from transformers import AutoTokenizer
>>> from optimum.onnxruntime import ORTModelForQuestionAnswering

>>> tokenizer = AutoTokenizer.from_pretrained("distilbert_base_uncased_squad_onnx")
>>> model = ORTModelForQuestionAnswering.from_pretrained("distilbert_base_uncased_squad_onnx")
>>> inputs = tokenizer("What am I using?", "Using DistilBERT with ONNX Runtime!", return_tensors="pt")
>>> outputs = model(**inputs)
```

The process is identical for TensorFlow checkpoints on the Hub. For instance, here's how you would export a pure TensorFlow checkpoint from the [Keras organization](https://huggingface.co/keras-io):

```bash
optimum-cli export onnx --model keras-io/transformers-qa distilbert_base_cased_squad_onnx/
```

### Exporting a 🤗 Transformers model to ONNX with `optimum.onnxruntime`

As an alternative to the CLI, you can export a 🤗 Transformers model to ONNX programmatically like so:

```python
>>> from optimum.onnxruntime import ORTModelForSequenceClassification
>>> from transformers import AutoTokenizer

>>> model_checkpoint = "distilbert_base_uncased_squad"
>>> save_directory = "onnx/"

>>> # Load a model from transformers and export it to ONNX
>>> ort_model = ORTModelForSequenceClassification.from_pretrained(model_checkpoint, export=True)
>>> tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

>>> # Save the onnx model and tokenizer
>>> ort_model.save_pretrained(save_directory)
>>> tokenizer.save_pretrained(save_directory)
```

### Exporting a model for an unsupported architecture

If you wish to contribute by adding support for a model that cannot currently be exported, you should first check if it is supported in [`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/exporters/onnx/overview), and if it is not, [contribute to 🤗 Optimum](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/contribute) directly.

### Exporting a model with `transformers.onnx`

`transformers.onnx` is no longer maintained; please export models with 🤗 Optimum as described above. This section will be removed in future versions.

To export a 🤗 Transformers model to ONNX with `transformers.onnx`, install extra dependencies:

```bash
pip install transformers[onnx]
```

Use the `transformers.onnx` package as a Python module to export a checkpoint using a ready-made configuration:

```bash
python -m transformers.onnx --model=distilbert-base-uncased onnx/
```

This exports an ONNX graph of the checkpoint defined by the `--model` argument. Pass any checkpoint on the 🤗 Hub or one that's stored locally. The resulting `model.onnx` file can then be run on one of the many accelerators that support the ONNX standard.
For example, +load and run the model with ONNX Runtime as follows: + +```python +>>> from transformers import AutoTokenizer +>>> from onnxruntime import InferenceSession + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") +>>> session = InferenceSession("onnx/model.onnx") +>>> # ONNX Runtime expects NumPy arrays as input +>>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np") +>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs)) +``` + +The required output names (like `["last_hidden_state"]`) can be obtained by taking a look at the ONNX configuration of +each model. For example, for DistilBERT we have: + +```python +>>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig + +>>> config = DistilBertConfig() +>>> onnx_config = DistilBertOnnxConfig(config) +>>> print(list(onnx_config.outputs.keys())) +["last_hidden_state"] +``` + +The process is identical for TensorFlow checkpoints on the Hub. For example, export a pure TensorFlow checkpoint like so: + +```bash +python -m transformers.onnx --model=keras-io/transformers-qa onnx/ +``` + +To export a model that's stored locally, save the model's weights and tokenizer files in the same directory (e.g. `local-pt-checkpoint`), +then export it to ONNX by pointing the `--model` argument of the `transformers.onnx` package to the desired directory: + +```bash +python -m transformers.onnx --model=local-pt-checkpoint onnx/ +``` \ No newline at end of file diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx deleted file mode 100644 index 7079a91f40c3..000000000000 --- a/docs/source/en/serialization.mdx +++ /dev/null @@ -1,538 +0,0 @@ - - -# Export to ONNX - -If you need to deploy 🤗 Transformers models in production environments, we recommend -exporting them to a serialized format that can be loaded and executed on specialized -runtimes and hardware. In this guide, we'll show you how to export 🤗 Transformers -models to [ONNX (Open Neural Network eXchange)](http://onnx.ai). - -ONNX is an open standard that defines a common set of operators and a common file format -to represent deep learning models in a wide variety of frameworks, including PyTorch and -TensorFlow. When a model is exported to the ONNX format, these operators are used to -construct a computational graph (often called an _intermediate representation_) which -represents the flow of data through the neural network. - -By exposing a graph with standardized operators and data types, ONNX makes it easy to -switch between frameworks. For example, a model trained in PyTorch can be exported to -ONNX format and then imported in TensorFlow (and vice versa). - -🤗 Transformers provides a [`transformers.onnx`](main_classes/onnx) package that enables -you to convert model checkpoints to an ONNX graph by leveraging configuration objects. -These configuration objects come ready made for a number of model architectures, and are -designed to be easily extendable to other architectures. - - - -You can also export 🤗 Transformers models with the [`optimum.exporters.onnx` package](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model) -from 🤗 Optimum. - -Once exported, a model can be: - -- Optimized for inference via techniques such as quantization and graph optimization. 
-- Run with ONNX Runtime via [`ORTModelForXXX` classes](https://huggingface.co/docs/optimum/onnxruntime/package_reference/modeling_ort), -which follow the same `AutoModel` API as the one you are used to in 🤗 Transformers. -- Run with [optimized inference pipelines](https://huggingface.co/docs/optimum/main/en/onnxruntime/usage_guides/pipelines), -which has the same API as the [`pipeline`] function in 🤗 Transformers. - -To explore all these features, check out the [🤗 Optimum library](https://github.com/huggingface/optimum). - - - -Ready-made configurations include the following architectures: - - - -- ALBERT -- BART -- BEiT -- BERT -- BigBird -- BigBird-Pegasus -- Blenderbot -- BlenderbotSmall -- BLOOM -- CamemBERT -- Chinese-CLIP -- CLIP -- CodeGen -- Conditional DETR -- ConvBERT -- ConvNeXT -- Data2VecText -- Data2VecVision -- DeBERTa -- DeBERTa-v2 -- DeiT -- DETR -- DistilBERT -- ELECTRA -- ERNIE -- FlauBERT -- GPT Neo -- GPT-J -- GPT-Sw3 -- GroupViT -- I-BERT -- ImageGPT -- LayoutLM -- LayoutLMv3 -- LeViT -- Longformer -- LongT5 -- M2M100 -- Marian -- mBART -- MobileBERT -- MobileNetV1 -- MobileNetV2 -- MobileViT -- MT5 -- OpenAI GPT-2 -- OWL-ViT -- Perceiver -- PLBart -- PoolFormer -- RemBERT -- ResNet -- RoBERTa -- RoBERTa-PreLayerNorm -- RoFormer -- SegFormer -- SqueezeBERT -- Swin Transformer -- T5 -- Table Transformer -- Vision Encoder decoder -- ViT -- Whisper -- XLM -- XLM-RoBERTa -- XLM-RoBERTa-XL -- YOLOS - -In the next two sections, we'll show you how to: - -* Export a supported model using the `transformers.onnx` package. -* Export a custom model for an unsupported architecture. - -## Exporting a model to ONNX - - - -The recommended way of exporting a model is now to use -[`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli), -do not worry it is very similar to `transformers.onnx`! - - - -To export a 🤗 Transformers model to ONNX, you'll first need to install some extra -dependencies: - -```bash -pip install transformers[onnx] -``` - -The `transformers.onnx` package can then be used as a Python module: - -```bash -python -m transformers.onnx --help - -usage: Hugging Face Transformers ONNX exporter [-h] -m MODEL [--feature {causal-lm, ...}] [--opset OPSET] [--atol ATOL] output - -positional arguments: - output Path indicating where to store generated ONNX model. - -optional arguments: - -h, --help show this help message and exit - -m MODEL, --model MODEL - Model ID on huggingface.co or path on disk to load model from. - --feature {causal-lm, ...} - The type of features to export the model with. - --opset OPSET ONNX opset version to export the model with. - --atol ATOL Absolute difference tolerance when validating the model. -``` - -Exporting a checkpoint using a ready-made configuration can be done as follows: - -```bash -python -m transformers.onnx --model=distilbert-base-uncased onnx/ -``` - -You should see the following logs: - -```bash -Validating ONNX model... - -[✓] ONNX model output names match reference model ({'last_hidden_state'}) - - Validating ONNX Model output "last_hidden_state": - -[✓] (2, 8, 768) matches (2, 8, 768) - -[✓] all values close (atol: 1e-05) -All good, model saved at: onnx/model.onnx -``` - -This exports an ONNX graph of the checkpoint defined by the `--model` argument. In this -example, it is `distilbert-base-uncased`, but it can be any checkpoint on the Hugging -Face Hub or one that's stored locally. 
- -The resulting `model.onnx` file can then be run on one of the [many -accelerators](https://onnx.ai/supported-tools.html#deployModel) that support the ONNX -standard. For example, we can load and run the model with [ONNX -Runtime](https://onnxruntime.ai/) as follows: - -```python ->>> from transformers import AutoTokenizer ->>> from onnxruntime import InferenceSession - ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") ->>> session = InferenceSession("onnx/model.onnx") ->>> # ONNX Runtime expects NumPy arrays as input ->>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np") ->>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs)) -``` - -The required output names (like `["last_hidden_state"]`) can be obtained by taking a -look at the ONNX configuration of each model. For example, for DistilBERT we have: - -```python ->>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig - ->>> config = DistilBertConfig() ->>> onnx_config = DistilBertOnnxConfig(config) ->>> print(list(onnx_config.outputs.keys())) -["last_hidden_state"] -``` - -The process is identical for TensorFlow checkpoints on the Hub. For example, we can -export a pure TensorFlow checkpoint from the [Keras -organization](https://huggingface.co/keras-io) as follows: - -```bash -python -m transformers.onnx --model=keras-io/transformers-qa onnx/ -``` - -To export a model that's stored locally, you'll need to have the model's weights and -tokenizer files stored in a directory. For example, we can load and save a checkpoint as -follows: - - -```python ->>> from transformers import AutoTokenizer, AutoModelForSequenceClassification - ->>> # Load tokenizer and PyTorch weights form the Hub ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") ->>> pt_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") ->>> # Save to disk ->>> tokenizer.save_pretrained("local-pt-checkpoint") ->>> pt_model.save_pretrained("local-pt-checkpoint") -``` - -Once the checkpoint is saved, we can export it to ONNX by pointing the `--model` -argument of the `transformers.onnx` package to the desired directory: - -```bash -python -m transformers.onnx --model=local-pt-checkpoint onnx/ -``` - -```python ->>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification - ->>> # Load tokenizer and TensorFlow weights from the Hub ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") ->>> # Save to disk ->>> tokenizer.save_pretrained("local-tf-checkpoint") ->>> tf_model.save_pretrained("local-tf-checkpoint") -``` - -Once the checkpoint is saved, we can export it to ONNX by pointing the `--model` -argument of the `transformers.onnx` package to the desired directory: - -```bash -python -m transformers.onnx --model=local-tf-checkpoint onnx/ -``` - - -## Selecting features for different model tasks - - - -The recommended way of exporting a model is now to use `optimum.exporters.onnx`. -You can check the [🤗 Optimum documentation](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#selecting-a-task) -to learn how to select a task. - - - -Each ready-made configuration comes with a set of _features_ that enable you to export -models for different types of tasks. 
As shown in the table below, each feature is -associated with a different `AutoClass`: - -| Feature | Auto Class | -| ------------------------------------ | ------------------------------------ | -| `causal-lm`, `causal-lm-with-past` | `AutoModelForCausalLM` | -| `default`, `default-with-past` | `AutoModel` | -| `masked-lm` | `AutoModelForMaskedLM` | -| `question-answering` | `AutoModelForQuestionAnswering` | -| `seq2seq-lm`, `seq2seq-lm-with-past` | `AutoModelForSeq2SeqLM` | -| `sequence-classification` | `AutoModelForSequenceClassification` | -| `token-classification` | `AutoModelForTokenClassification` | - -For each configuration, you can find the list of supported features via the -[`~transformers.onnx.FeaturesManager`]. For example, for DistilBERT we have: - -```python ->>> from transformers.onnx.features import FeaturesManager - ->>> distilbert_features = list(FeaturesManager.get_supported_features_for_model_type("distilbert").keys()) ->>> print(distilbert_features) -["default", "masked-lm", "causal-lm", "sequence-classification", "token-classification", "question-answering"] -``` - -You can then pass one of these features to the `--feature` argument in the -`transformers.onnx` package. For example, to export a text-classification model we can -pick a fine-tuned model from the Hub and run: - -```bash -python -m transformers.onnx --model=distilbert-base-uncased-finetuned-sst-2-english \ - --feature=sequence-classification onnx/ -``` - -This displays the following logs: - -```bash -Validating ONNX model... - -[✓] ONNX model output names match reference model ({'logits'}) - - Validating ONNX Model output "logits": - -[✓] (2, 2) matches (2, 2) - -[✓] all values close (atol: 1e-05) -All good, model saved at: onnx/model.onnx -``` - -Notice that in this case, the output names from the fine-tuned model are `logits` -instead of the `last_hidden_state` we saw with the `distilbert-base-uncased` checkpoint -earlier. This is expected since the fine-tuned model has a sequence classification head. - - - -The features that have a `with-past` suffix (like `causal-lm-with-past`) correspond to -model classes with precomputed hidden states (key and values in the attention blocks) -that can be used for fast autoregressive decoding. - - - - - -For `VisionEncoderDecoder` type models, the encoder and decoder parts are -exported separately as two ONNX files named `encoder_model.onnx` and `decoder_model.onnx` respectively. - - - - -## Exporting a model for an unsupported architecture - - - -If you wish to contribute by adding support for a model that cannot be currently exported, you should first check if it is -supported in [`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/main/en/exporters/onnx/package_reference/configuration#supported-architectures), -and if it is not, [contribute to 🤗 Optimum](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/contribute) -directly. - - - -If you wish to export a model whose architecture is not natively supported by the -library, there are three main steps to follow: - -1. Implement a custom ONNX configuration. -2. Export the model to ONNX. -3. Validate the outputs of the PyTorch and exported models. - -In this section, we'll look at how DistilBERT was implemented to show what's involved -with each step. - -### Implementing a custom ONNX configuration - -Let's start with the ONNX configuration object. 
We provide three abstract classes that -you should inherit from, depending on the type of model architecture you wish to export: - -* Encoder-based models inherit from [`~onnx.config.OnnxConfig`] -* Decoder-based models inherit from [`~onnx.config.OnnxConfigWithPast`] -* Encoder-decoder models inherit from [`~onnx.config.OnnxSeq2SeqConfigWithPast`] - - - -A good way to implement a custom ONNX configuration is to look at the existing -implementation in the `configuration_.py` file of a similar architecture. - - - -Since DistilBERT is an encoder-based model, its configuration inherits from -`OnnxConfig`: - -```python ->>> from typing import Mapping, OrderedDict ->>> from transformers.onnx import OnnxConfig - - ->>> class DistilBertOnnxConfig(OnnxConfig): -... @property -... def inputs(self) -> Mapping[str, Mapping[int, str]]: -... return OrderedDict( -... [ -... ("input_ids", {0: "batch", 1: "sequence"}), -... ("attention_mask", {0: "batch", 1: "sequence"}), -... ] -... ) -``` - -Every configuration object must implement the `inputs` property and return a mapping, -where each key corresponds to an expected input, and each value indicates the axis of -that input. For DistilBERT, we can see that two inputs are required: `input_ids` and -`attention_mask`. These inputs have the same shape of `(batch_size, sequence_length)` -which is why we see the same axes used in the configuration. - - - -Notice that `inputs` property for `DistilBertOnnxConfig` returns an `OrderedDict`. This -ensures that the inputs are matched with their relative position within the -`PreTrainedModel.forward()` method when tracing the graph. We recommend using an -`OrderedDict` for the `inputs` and `outputs` properties when implementing custom ONNX -configurations. - - - -Once you have implemented an ONNX configuration, you can instantiate it by providing the -base model's configuration as follows: - -```python ->>> from transformers import AutoConfig - ->>> config = AutoConfig.from_pretrained("distilbert-base-uncased") ->>> onnx_config = DistilBertOnnxConfig(config) -``` - -The resulting object has several useful properties. For example, you can view the ONNX -operator set that will be used during the export: - -```python ->>> print(onnx_config.default_onnx_opset) -11 -``` - -You can also view the outputs associated with the model as follows: - -```python ->>> print(onnx_config.outputs) -OrderedDict([("last_hidden_state", {0: "batch", 1: "sequence"})]) -``` - -Notice that the outputs property follows the same structure as the inputs; it returns an -`OrderedDict` of named outputs and their shapes. The output structure is linked to the -choice of feature that the configuration is initialised with. By default, the ONNX -configuration is initialized with the `default` feature that corresponds to exporting a -model loaded with the `AutoModel` class. If you want to export a model for another task, -just provide a different feature to the `task` argument when you initialize the ONNX -configuration. 
For example, if we wished to export DistilBERT with a sequence -classification head, we could use: - -```python ->>> from transformers import AutoConfig - ->>> config = AutoConfig.from_pretrained("distilbert-base-uncased") ->>> onnx_config_for_seq_clf = DistilBertOnnxConfig(config, task="sequence-classification") ->>> print(onnx_config_for_seq_clf.outputs) -OrderedDict([('logits', {0: 'batch'})]) -``` - - - -All of the base properties and methods associated with [`~onnx.config.OnnxConfig`] and -the other configuration classes can be overridden if needed. Check out [`BartOnnxConfig`] -for an advanced example. - - - -### Exporting the model - -Once you have implemented the ONNX configuration, the next step is to export the model. -Here we can use the `export()` function provided by the `transformers.onnx` package. -This function expects the ONNX configuration, along with the base model and tokenizer, -and the path to save the exported file: - -```python ->>> from pathlib import Path ->>> from transformers.onnx import export ->>> from transformers import AutoTokenizer, AutoModel - ->>> onnx_path = Path("model.onnx") ->>> model_ckpt = "distilbert-base-uncased" ->>> base_model = AutoModel.from_pretrained(model_ckpt) ->>> tokenizer = AutoTokenizer.from_pretrained(model_ckpt) - ->>> onnx_inputs, onnx_outputs = export(tokenizer, base_model, onnx_config, onnx_config.default_onnx_opset, onnx_path) -``` - -The `onnx_inputs` and `onnx_outputs` returned by the `export()` function are lists of -the keys defined in the `inputs` and `outputs` properties of the configuration. Once the -model is exported, you can test that the model is well formed as follows: - -```python ->>> import onnx - ->>> onnx_model = onnx.load("model.onnx") ->>> onnx.checker.check_model(onnx_model) -``` - - - -If your model is larger than 2GB, you will see that many additional files are created -during the export. This is _expected_ because ONNX uses [Protocol -Buffers](https://developers.google.com/protocol-buffers/) to store the model and these -have a size limit of 2GB. See the [ONNX -documentation](https://github.com/onnx/onnx/blob/master/docs/ExternalData.md) for -instructions on how to load models with external data. - - - -### Validating the model outputs - -The final step is to validate that the outputs from the base and exported model agree -within some absolute tolerance. Here we can use the `validate_model_outputs()` function -provided by the `transformers.onnx` package as follows: - -```python ->>> from transformers.onnx import validate_model_outputs - ->>> validate_model_outputs( -... onnx_config, tokenizer, base_model, onnx_path, onnx_outputs, onnx_config.atol_for_validation -... ) -``` - -This function uses the [`~transformers.onnx.OnnxConfig.generate_dummy_inputs`] method to -generate inputs for the base and exported model, and the absolute tolerance can be -defined in the configuration. We generally find numerical agreement in the 1e-6 to 1e-4 -range, although anything smaller than 1e-3 is likely to be OK. - -## Contributing a new configuration to 🤗 Transformers - -We are looking to expand the set of ready-made configurations and welcome contributions -from the community! 
If you would like to contribute your addition to the library, you -will need to: - -* Implement the ONNX configuration in the corresponding `configuration_.py` -file -* Include the model architecture and corresponding features in - [`~onnx.features.FeatureManager`] -* Add your model architecture to the tests in `test_onnx_v2.py` - -Check out how the configuration for [IBERT was -contributed](https://github.com/huggingface/transformers/pull/14868/files) to get an -idea of what's involved. diff --git a/docs/source/en/task_summary.md b/docs/source/en/task_summary.md new file mode 100644 index 000000000000..6cb49f88ba7f --- /dev/null +++ b/docs/source/en/task_summary.md @@ -0,0 +1,341 @@ + + +# What 🤗 Transformers can do + +🤗 Transformers is a library of pretrained state-of-the-art models for natural language processing (NLP), computer vision, and audio and speech processing tasks. Not only does the library contain Transformer models, but it also has non-Transformer models like modern convolutional networks for computer vision tasks. If you look at some of the most popular consumer products today, like smartphones, apps, and televisions, odds are that some kind of deep learning technology is behind it. Want to remove a background object from a picture taken by your smartphone? This is an example of a panoptic segmentation task (don't worry if you don't know what this means yet, we'll describe it in the following sections!). + +This page provides an overview of the different speech and audio, computer vision, and NLP tasks that can be solved with the 🤗 Transformers library in just three lines of code! + +## Audio + +Audio and speech processing tasks are a little different from the other modalities mainly because audio as an input is a continuous signal. Unlike text, a raw audio waveform can't be neatly split into discrete chunks the way a sentence can be divided into words. To get around this, the raw audio signal is typically sampled at regular intervals. If you take more samples within an interval, the sampling rate is higher, and the audio more closely resembles the original audio source. + +Previous approaches preprocessed the audio to extract useful features from it. It is now more common to start audio and speech processing tasks by directly feeding the raw audio waveform to a feature encoder to extract an audio representation. This simplifies the preprocessing step and allows the model to learn the most essential features. + +### Audio classification + +Audio classification is a task that labels audio data from a predefined set of classes. 
It is a broad category with many specific applications, some of which include: + +* acoustic scene classification: label audio with a scene label ("office", "beach", "stadium") +* acoustic event detection: label audio with a sound event label ("car horn", "whale calling", "glass breaking") +* tagging: label audio containing multiple sounds (birdsongs, speaker identification in a meeting) +* music classification: label music with a genre label ("metal", "hip-hop", "country") + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline(task="audio-classification", model="superb/hubert-base-superb-er") +>>> preds = classifier("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> preds +[{'score': 0.4532, 'label': 'hap'}, + {'score': 0.3622, 'label': 'sad'}, + {'score': 0.0943, 'label': 'neu'}, + {'score': 0.0903, 'label': 'ang'}] +``` + +### Automatic speech recognition + +Automatic speech recognition (ASR) transcribes speech into text. It is one of the most common audio tasks due partly to speech being such a natural form of human communication. Today, ASR systems are embedded in "smart" technology products like speakers, phones, and cars. We can ask our virtual assistants to play music, set reminders, and tell us the weather. + +But one of the key challenges Transformer architectures have helped with is in low-resource languages. By pretraining on large amounts of speech data, finetuning the model on only one hour of labeled speech data in a low-resource language can still produce high-quality results compared to previous ASR systems trained on 100x more labeled data. + +```py +>>> from transformers import pipeline + +>>> transcriber = pipeline(task="automatic-speech-recognition", model="openai/whisper-small") +>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") +{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'} +``` + +## Computer vision + +One of the first and earliest successful computer vision tasks was recognizing images of zip code numbers using a [convolutional neural network (CNN)](glossary#convolution). An image is composed of pixels, and each pixel has a numerical value. This makes it easy to represent an image as a matrix of pixel values. Each particular combination of pixel values describes the colors of an image. + +Two general ways computer vision tasks can be solved are: + +1. Use convolutions to learn the hierarchical features of an image from low-level features to high-level abstract things. +2. Split an image into patches and use a Transformer to gradually learn how each image patch is related to each other to form an image. Unlike the bottom-up approach favored by a CNN, this is kind of like starting out with a blurry image and then gradually bringing it into focus. + +### Image classification + +Image classification labels an entire image from a predefined set of classes. 
Like most classification tasks, there are many practical use cases for image classification, some of which include: + +* healthcare: label medical images to detect disease or monitor patient health +* environment: label satellite images to monitor deforestation, inform wildland management or detect wildfires +* agriculture: label images of crops to monitor plant health or satellite images for land use monitoring +* ecology: label images of animal or plant species to monitor wildlife populations or track endangered species + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline(task="image-classification") +>>> preds = classifier( +... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... ) +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> print(*preds, sep="\n") +{'score': 0.4335, 'label': 'lynx, catamount'} +{'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'} +{'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'} +{'score': 0.0239, 'label': 'Egyptian cat'} +{'score': 0.0229, 'label': 'tiger cat'} +``` + +### Object detection + +Unlike image classification, object detection identifies multiple objects within an image and the objects' positions in an image (defined by the bounding box). Some example applications of object detection include: + +* self-driving vehicles: detect everyday traffic objects such as other vehicles, pedestrians, and traffic lights +* remote sensing: disaster monitoring, urban planning, and weather forecasting +* defect detection: detect cracks or structural damage in buildings, and manufacturing defects + +```py +>>> from transformers import pipeline + +>>> detector = pipeline(task="object-detection") +>>> preds = detector( +... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... ) +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"], "box": pred["box"]} for pred in preds] +>>> preds +[{'score': 0.9865, + 'label': 'cat', + 'box': {'xmin': 178, 'ymin': 154, 'xmax': 882, 'ymax': 598}}] +``` + +### Image segmentation + +Image segmentation is a pixel-level task that assigns every pixel in an image to a class. It differs from object detection, which uses bounding boxes to label and predict objects in an image because segmentation is more granular. Segmentation can detect objects at a pixel-level. There are several types of image segmentation: + +* instance segmentation: in addition to labeling the class of an object, it also labels each distinct instance of an object ("dog-1", "dog-2") +* panoptic segmentation: a combination of semantic and instance segmentation; it labels each pixel with a semantic class **and** each distinct instance of an object + +Segmentation tasks are helpful in self-driving vehicles to create a pixel-level map of the world around them so they can navigate safely around pedestrians and other vehicles. It is also useful for medical imaging, where the task's finer granularity can help identify abnormal cells or organ features. Image segmentation can also be used in ecommerce to virtually try on clothes or create augmented reality experiences by overlaying objects in the real world through your camera. + +```py +>>> from transformers import pipeline + +>>> segmenter = pipeline(task="image-segmentation") +>>> preds = segmenter( +... 
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... ) +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> print(*preds, sep="\n") +{'score': 0.9879, 'label': 'LABEL_184'} +{'score': 0.9973, 'label': 'snow'} +{'score': 0.9972, 'label': 'cat'} +``` + +### Depth estimation + +Depth estimation predicts the distance of each pixel in an image from the camera. This computer vision task is especially important for scene understanding and reconstruction. For example, in self-driving cars, vehicles need to understand how far objects like pedestrians, traffic signs, and other vehicles are to avoid obstacles and collisions. Depth information is also helpful for constructing 3D representations from 2D images and can be used to create high-quality 3D representations of biological structures or buildings. + +There are two approaches to depth estimation: + +* stereo: depths are estimated by comparing two images of the same image from slightly different angles +* monocular: depths are estimated from a single image + +```py +>>> from transformers import pipeline + +>>> depth_estimator = pipeline(task="depth-estimation") +>>> preds = depth_estimator( +... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... ) +``` + +## Natural language processing + +NLP tasks are among the most common types of tasks because text is such a natural way for us to communicate. To get text into a format recognized by a model, it needs to be tokenized. This means dividing a sequence of text into separate words or subwords (tokens) and then converting these tokens into numbers. As a result, you can represent a sequence of text as a sequence of numbers, and once you have a sequence of numbers, it can be input into a model to solve all sorts of NLP tasks! + +### Text classification + +Like classification tasks in any modality, text classification labels a sequence of text (it can be sentence-level, a paragraph, or a document) from a predefined set of classes. There are many practical applications for text classification, some of which include: + +* sentiment analysis: label text according to some polarity like `positive` or `negative` which can inform and support decision-making in fields like politics, finance, and marketing +* content classification: label text according to some topic to help organize and filter information in news and social media feeds (`weather`, `sports`, `finance`, etc.) + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline(task="sentiment-analysis") +>>> preds = classifier("Hugging Face is the best thing since sliced bread!") +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> preds +[{'score': 0.9991, 'label': 'POSITIVE'}] +``` + +### Token classification + +In any NLP task, text is preprocessed by separating the sequence of text into individual words or subwords. These are known as [tokens](/glossary#token). Token classification assigns each token a label from a predefined set of classes. + +Two common types of token classification are: + +* named entity recognition (NER): label a token according to an entity category like organization, person, location or date. NER is especially popular in biomedical settings, where it can label genes, proteins, and drug names. +* part-of-speech tagging (POS): label a token according to its part-of-speech like noun, verb, or adjective. 
POS is useful for helping translation systems understand how two identical words are grammatically different (bank as a noun versus bank as a verb). + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline(task="ner") +>>> preds = classifier("Hugging Face is a French company based in New York City.") +>>> preds = [ +... { +... "entity": pred["entity"], +... "score": round(pred["score"], 4), +... "index": pred["index"], +... "word": pred["word"], +... "start": pred["start"], +... "end": pred["end"], +... } +... for pred in preds +... ] +>>> print(*preds, sep="\n") +{'entity': 'I-ORG', 'score': 0.9968, 'index': 1, 'word': 'Hu', 'start': 0, 'end': 2} +{'entity': 'I-ORG', 'score': 0.9293, 'index': 2, 'word': '##gging', 'start': 2, 'end': 7} +{'entity': 'I-ORG', 'score': 0.9763, 'index': 3, 'word': 'Face', 'start': 8, 'end': 12} +{'entity': 'I-MISC', 'score': 0.9983, 'index': 6, 'word': 'French', 'start': 18, 'end': 24} +{'entity': 'I-LOC', 'score': 0.999, 'index': 10, 'word': 'New', 'start': 42, 'end': 45} +{'entity': 'I-LOC', 'score': 0.9987, 'index': 11, 'word': 'York', 'start': 46, 'end': 50} +{'entity': 'I-LOC', 'score': 0.9992, 'index': 12, 'word': 'City', 'start': 51, 'end': 55} +``` + +### Question answering + +Question answering is another token-level task that returns an answer to a question, sometimes with context (open-domain) and other times without context (closed-domain). This task happens whenever we ask a virtual assistant something like whether a restaurant is open. It can also provide customer or technical support and help search engines retrieve the relevant information you're asking for. + +There are two common types of question answering: + +* extractive: given a question and some context, the answer is a span of text from the context the model must extract +* abstractive: given a question and some context, the answer is generated from the context; this approach is handled by the [`Text2TextGenerationPipeline`] instead of the [`QuestionAnsweringPipeline`] shown below + + +```py +>>> from transformers import pipeline + +>>> question_answerer = pipeline(task="question-answering") +>>> preds = question_answerer( +... question="What is the name of the repository?", +... context="The name of the repository is huggingface/transformers", +... ) +>>> print( +... f"score: {round(preds['score'], 4)}, start: {preds['start']}, end: {preds['end']}, answer: {preds['answer']}" +... ) +score: 0.9327, start: 30, end: 54, answer: huggingface/transformers +``` + +### Summarization + +Summarization creates a shorter version of a text from a longer one while trying to preserve most of the meaning of the original document. Summarization is a sequence-to-sequence task; it outputs a shorter text sequence than the input. There are a lot of long-form documents that can be summarized to help readers quickly understand the main points. Legislative bills, legal and financial documents, patents, and scientific papers are a few examples of documents that could be summarized to save readers time and serve as a reading aid. + +Like question answering, there are two types of summarization: + +* extractive: identify and extract the most important sentences from the original text +* abstractive: generate the target summary (which may include new words not in the input document) from the original text; the [`SummarizationPipeline`] uses the abstractive approach + +```py +>>> from transformers import pipeline + +>>> summarizer = pipeline(task="summarization") +>>> summarizer( +... 
"In this work, we presented the Transformer, the first sequence transduction model based entirely on attention, replacing the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention. For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers. On both WMT 2014 English-to-German and WMT 2014 English-to-French translation tasks, we achieve a new state of the art. In the former task our best model outperforms even all previously reported ensembles." +... ) +[{'summary_text': ' The Transformer is the first sequence transduction model based entirely on attention . It replaces the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention . For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers .'}] +``` + +### Translation + +Translation converts a sequence of text in one language to another. It is important in helping people from different backgrounds communicate with each other, help translate content to reach wider audiences, and even be a learning tool to help people learn a new language. Along with summarization, translation is a sequence-to-sequence task, meaning the model receives an input sequence and returns a target output sequence. + +In the early days, translation models were mostly monolingual, but recently, there has been increasing interest in multilingual models that can translate between many pairs of languages. + +```py +>>> from transformers import pipeline + +>>> text = "translate English to French: Hugging Face is a community-based open-source platform for machine learning." +>>> translator = pipeline(task="translation", model="t5-small") +>>> translator(text) +[{'translation_text': "Hugging Face est une tribune communautaire de l'apprentissage des machines."}] +``` + +### Language modeling + +Language modeling is a task that predicts a word in a sequence of text. It has become a very popular NLP task because a pretrained language model can be finetuned for many other downstream tasks. Lately, there has been a lot of interest in large language models (LLMs) which demonstrate zero- or few-shot learning. This means the model can solve tasks it wasn't explicitly trained to do! Language models can be used to generate fluent and convincing text, though you need to be careful since the text may not always be accurate. + +There are two types of language modeling: + +* causal: the model's objective is to predict the next token in a sequence, and future tokens are masked + + ```py + >>> from transformers import pipeline + + >>> prompt = "Hugging Face is a community-based open-source platform for machine learning." + >>> generator = pipeline(task="text-generation") + >>> generator(prompt) # doctest: +SKIP + ``` + +* masked: the model's objective is to predict a masked token in a sequence with full access to the tokens in the sequence + + ```py + >>> text = "Hugging Face is a community-based open-source for machine learning." + >>> fill_mask = pipeline(task="fill-mask") + >>> preds = fill_mask(text, top_k=1) + >>> preds = [ + ... { + ... "score": round(pred["score"], 4), + ... "token": pred["token"], + ... "token_str": pred["token_str"], + ... "sequence": pred["sequence"], + ... } + ... for pred in preds + ... 
] + >>> preds + [{'score': 0.2236, + 'token': 1761, + 'token_str': ' platform', + 'sequence': 'Hugging Face is a community-based open-source platform for machine learning.'}] + ``` + +## Multimodal + +Multimodal tasks require a model to process multiple data modalities (text, image, audio, video) to solve a particular problem. Image captioning is an example of a multimodal task where the model takes an image as input and outputs a sequence of text describing the image or some properties of the image. + +Although multimodal models work with different data types or modalities, internally, the preprocessing steps help the model convert all the data types into embeddings (vectors or list of numbers that holds meaningful information about the data). For a task like image captioning, the model learns relationships between image embeddings and text embeddings. + +### Document question answering + +Document question answering is a task that answers natural language questions from a document. Unlike a token-level question answering task which takes text as input, document question answering takes an image of a document as input along with a question about the document and returns an answer. Document question answering can be used to parse structured documents and extract key information from it. In the example below, the total amount and change due can be extracted from a receipt. + +```py +>>> from transformers import pipeline +>>> from PIL import Image +>>> import requests + +>>> url = "https://datasets-server.huggingface.co/assets/hf-internal-testing/example-documents/--/hf-internal-testing--example-documents/test/2/image/image.jpg" +>>> image = Image.open(requests.get(url, stream=True).raw) + +>>> doc_question_answerer = pipeline("document-question-answering", model="magorshunov/layoutlm-invoices") +>>> preds = doc_question_answerer( +... question="What is the total amount?", +... image=image, +... ) +>>> preds +[{'score': 0.8531, 'answer': '17,000', 'start': 4, 'end': 4}] +``` + +Hopefully, this page has given you some more background information about all the types of tasks in each modality and the practical importance of each one. In the next [section](tasks_explained), you'll learn **how** 🤗 Transformers work to solve these tasks. \ No newline at end of file diff --git a/docs/source/en/task_summary.mdx b/docs/source/en/task_summary.mdx deleted file mode 100644 index 697ee21df5f9..000000000000 --- a/docs/source/en/task_summary.mdx +++ /dev/null @@ -1,1134 +0,0 @@ - - -# Summary of the tasks - -[[open-in-colab]] - -This page shows the most frequent use-cases when using the library. The models available allow for many different -configurations and a great versatility in use-cases. The most simple ones are presented here, showcasing usage for -tasks such as image classification, question answering, sequence classification, named entity recognition and others. - -These examples leverage auto-models, which are classes that will instantiate a model according to a given checkpoint, -automatically selecting the correct model architecture. Please check the [`AutoModel`] documentation -for more information. Feel free to modify the code to be more specific and adapt it to your specific use-case. - -In order for a model to perform well on a task, it must be loaded from a checkpoint corresponding to that task. These -checkpoints are usually pre-trained on a large corpus of data and fine-tuned on a specific task. This means the -following: - -- Not all models were fine-tuned on all tasks. 
If you want to fine-tune a model on a specific task, you can leverage - one of the *run_$TASK.py* scripts in the [examples](https://github.com/huggingface/transformers/tree/main/examples) directory. -- Fine-tuned models were fine-tuned on a specific dataset. This dataset may or may not overlap with your use-case and - domain. As mentioned previously, you may leverage the [examples](https://github.com/huggingface/transformers/tree/main/examples) scripts to fine-tune your model, or you may - create your own training script. - -In order to do an inference on a task, several mechanisms are made available by the library: - -- Pipelines: very easy-to-use abstractions, which require as little as two lines of code. -- Direct model use: Less abstractions, but more flexibility and power via a direct access to a tokenizer - (PyTorch/TensorFlow) and full inference capacity. - -Both approaches are showcased here. - - - -All tasks presented here leverage pre-trained checkpoints that were fine-tuned on specific tasks. Loading a -checkpoint that was not fine-tuned on a specific task would load only the base transformer layers and not the -additional head that is used for the task, initializing the weights of that head randomly. - -This would produce random output. - - - -## Sequence Classification - -Sequence classification is the task of classifying sequences according to a given number of classes. An example of -sequence classification is the GLUE dataset, which is entirely based on that task. If you would like to fine-tune a -model on a GLUE sequence classification task, you may leverage the [run_glue.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_glue.py), [run_tf_glue.py](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/text-classification/run_tf_glue.py), [run_tf_text_classification.py](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/text-classification/run_tf_text_classification.py) or [run_xnli.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_xnli.py) scripts. - -Here is an example of using pipelines to do sentiment analysis: identifying if a sequence is positive or negative. It -leverages a fine-tuned model on sst2, which is a GLUE task. - -This returns a label ("POSITIVE" or "NEGATIVE") alongside a score, as follows: - -```py ->>> from transformers import pipeline - ->>> classifier = pipeline("sentiment-analysis") - ->>> result = classifier("I hate you")[0] ->>> print(f"label: {result['label']}, with score: {round(result['score'], 4)}") -label: NEGATIVE, with score: 0.9991 - ->>> result = classifier("I love you")[0] ->>> print(f"label: {result['label']}, with score: {round(result['score'], 4)}") -label: POSITIVE, with score: 0.9999 -``` - -Here is an example of doing a sequence classification using a model to determine if two sequences are paraphrases of -each other. The process is the following: - -1. Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and loads it - with the weights stored in the checkpoint. -2. Build a sequence from the two sentences, with the correct model-specific separators, token type ids and attention - masks (which will be created automatically by the tokenizer). -3. Pass this sequence through the model so that it is classified in one of the two available classes: 0 (not a - paraphrase) and 1 (is a paraphrase). -4. 
Compute the softmax of the result to get probabilities over the classes. -5. Print the results. - - - -```py ->>> from transformers import AutoTokenizer, AutoModelForSequenceClassification ->>> import torch - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc") ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc") - ->>> classes = ["not paraphrase", "is paraphrase"] - ->>> sequence_0 = "The company HuggingFace is based in New York City" ->>> sequence_1 = "Apples are especially bad for your health" ->>> sequence_2 = "HuggingFace's headquarters are situated in Manhattan" - ->>> # The tokenizer will automatically add any model specific separators (i.e. and ) and tokens to ->>> # the sequence, as well as compute the attention masks. ->>> paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="pt") ->>> not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="pt") - ->>> paraphrase_classification_logits = model(**paraphrase).logits ->>> not_paraphrase_classification_logits = model(**not_paraphrase).logits - ->>> paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0] ->>> not_paraphrase_results = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0] - ->>> # Should be paraphrase ->>> for i in range(len(classes)): -... print(f"{classes[i]}: {int(round(paraphrase_results[i] * 100))}%") -not paraphrase: 10% -is paraphrase: 90% - ->>> # Should not be paraphrase ->>> for i in range(len(classes)): -... print(f"{classes[i]}: {int(round(not_paraphrase_results[i] * 100))}%") -not paraphrase: 94% -is paraphrase: 6% -``` - - -```py ->>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification ->>> import tensorflow as tf - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc") ->>> model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc") - ->>> classes = ["not paraphrase", "is paraphrase"] - ->>> sequence_0 = "The company HuggingFace is based in New York City" ->>> sequence_1 = "Apples are especially bad for your health" ->>> sequence_2 = "HuggingFace's headquarters are situated in Manhattan" - ->>> # The tokenizer will automatically add any model specific separators (i.e. and ) and tokens to ->>> # the sequence, as well as compute the attention masks. ->>> paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="tf") ->>> not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="tf") - ->>> paraphrase_classification_logits = model(paraphrase).logits ->>> not_paraphrase_classification_logits = model(not_paraphrase).logits - ->>> paraphrase_results = tf.nn.softmax(paraphrase_classification_logits, axis=1).numpy()[0] ->>> not_paraphrase_results = tf.nn.softmax(not_paraphrase_classification_logits, axis=1).numpy()[0] - ->>> # Should be paraphrase ->>> for i in range(len(classes)): -... print(f"{classes[i]}: {int(round(paraphrase_results[i] * 100))}%") -not paraphrase: 10% -is paraphrase: 90% - ->>> # Should not be paraphrase ->>> for i in range(len(classes)): -... print(f"{classes[i]}: {int(round(not_paraphrase_results[i] * 100))}%") -not paraphrase: 94% -is paraphrase: 6% -``` - - - -## Extractive Question Answering - -Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a -question answering dataset is the SQuAD dataset, which is entirely based on that task. 
If you would like to fine-tune a -model on a SQuAD task, you may leverage the [run_qa.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering/run_qa.py) and -[run_tf_squad.py](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/question-answering/run_tf_squad.py) -scripts. - - -Here is an example of using pipelines to do question answering: extracting an answer from a text given a question. It -leverages a fine-tuned model on SQuAD. - -```py ->>> from transformers import pipeline - ->>> question_answerer = pipeline("question-answering") - ->>> context = r""" -... Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a -... question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune -... a model on a SQuAD task, you may leverage the examples/pytorch/question-answering/run_squad.py script. -... """ -``` - -This returns an answer extracted from the text, a confidence score, alongside "start" and "end" values, which are the -positions of the extracted answer in the text. - -```py ->>> result = question_answerer(question="What is extractive question answering?", context=context) ->>> print( -... f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}" -... ) -Answer: 'the task of extracting an answer from a text given a question', score: 0.6177, start: 34, end: 95 - ->>> result = question_answerer(question="What is a good example of a question answering dataset?", context=context) ->>> print( -... f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}" -... ) -Answer: 'SQuAD dataset', score: 0.5152, start: 147, end: 160 -``` - -Here is an example of question answering using a model and a tokenizer. The process is the following: - -1. Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and loads it - with the weights stored in the checkpoint. -2. Define a text and a few questions. -3. Iterate over the questions and build a sequence from the text and the current question, with the correct - model-specific separators, token type ids and attention masks. -4. Pass this sequence through the model. This outputs a range of scores across the entire sequence tokens (question and - text), for both the start and end positions. -5. Compute the softmax of the result to get probabilities over the tokens. -6. Fetch the tokens from the identified start and stop values, convert those tokens to a string. -7. Print the results. - - - -```py ->>> from transformers import AutoTokenizer, AutoModelForQuestionAnswering ->>> import torch - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad") ->>> model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad") - ->>> text = r""" -... 🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose -... architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural -... Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between -... TensorFlow 2.0 and PyTorch. -... """ - ->>> questions = [ -... "How many pretrained models are available in 🤗 Transformers?", -... 
"What does 🤗 Transformers provide?", -... "🤗 Transformers provides interoperability between which frameworks?", -... ] - ->>> for question in questions: -... inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt") -... input_ids = inputs["input_ids"].tolist()[0] - -... outputs = model(**inputs) -... answer_start_scores = outputs.start_logits -... answer_end_scores = outputs.end_logits - -... # Get the most likely beginning of answer with the argmax of the score -... answer_start = torch.argmax(answer_start_scores) -... # Get the most likely end of answer with the argmax of the score -... answer_end = torch.argmax(answer_end_scores) + 1 - -... answer = tokenizer.convert_tokens_to_string( -... tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]) -... ) - -... print(f"Question: {question}") -... print(f"Answer: {answer}") -Question: How many pretrained models are available in 🤗 Transformers? -Answer: over 32 + -Question: What does 🤗 Transformers provide? -Answer: general - purpose architectures -Question: 🤗 Transformers provides interoperability between which frameworks? -Answer: tensorflow 2. 0 and pytorch -``` - - -```py ->>> from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering ->>> import tensorflow as tf - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad") ->>> model = TFAutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad") - ->>> text = r""" -... 🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose -... architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural -... Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between -... TensorFlow 2.0 and PyTorch. -... """ - ->>> questions = [ -... "How many pretrained models are available in 🤗 Transformers?", -... "What does 🤗 Transformers provide?", -... "🤗 Transformers provides interoperability between which frameworks?", -... ] - ->>> for question in questions: -... inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="tf") -... input_ids = inputs["input_ids"].numpy()[0] - -... outputs = model(inputs) -... answer_start_scores = outputs.start_logits -... answer_end_scores = outputs.end_logits - -... # Get the most likely beginning of answer with the argmax of the score -... answer_start = tf.argmax(answer_start_scores, axis=1).numpy()[0] -... # Get the most likely end of answer with the argmax of the score -... answer_end = tf.argmax(answer_end_scores, axis=1).numpy()[0] + 1 - -... answer = tokenizer.convert_tokens_to_string( -... tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]) -... ) - -... print(f"Question: {question}") -... print(f"Answer: {answer}") -Question: How many pretrained models are available in 🤗 Transformers? -Answer: over 32 + -Question: What does 🤗 Transformers provide? -Answer: general - purpose architectures -Question: 🤗 Transformers provides interoperability between which frameworks? -Answer: tensorflow 2. 0 and pytorch -``` - - - -## Language Modeling - -Language modeling is the task of fitting a model to a corpus, which can be domain specific. All popular -transformer-based models are trained using a variant of language modeling, e.g. BERT with masked language modeling, -GPT-2 with causal language modeling. 
- -Language modeling can be useful outside of pretraining as well, for example to shift the model distribution to be -domain-specific: using a language model trained over a very large corpus, and then fine-tuning it to a news dataset or -on scientific papers e.g. [LysandreJik/arxiv-nlp](https://huggingface.co/lysandre/arxiv-nlp). - -### Masked Language Modeling - -Masked language modeling is the task of masking tokens in a sequence with a masking token, and prompting the model to -fill that mask with an appropriate token. This allows the model to attend to both the right context (tokens on the -right of the mask) and the left context (tokens on the left of the mask). Such a training creates a strong basis for -downstream tasks requiring bi-directional context, such as SQuAD (question answering, see [Lewis, Lui, Goyal et al.](https://arxiv.org/abs/1910.13461), part 4.2). If you would like to fine-tune a model on a masked language modeling -task, you may leverage the [run_mlm.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling/run_mlm.py) script. - -Here is an example of using pipelines to replace a mask from a sequence: - -```py ->>> from transformers import pipeline - ->>> unmasker = pipeline("fill-mask") -``` - -This outputs the sequences with the mask filled, the confidence score, and the token id in the tokenizer vocabulary: - -```py ->>> from pprint import pprint - ->>> pprint( -... unmasker( -... f"HuggingFace is creating a {unmasker.tokenizer.mask_token} that the community uses to solve NLP tasks." -... ) -... ) -[{'score': 0.1793, - 'sequence': 'HuggingFace is creating a tool that the community uses to solve ' - 'NLP tasks.', - 'token': 3944, - 'token_str': ' tool'}, - {'score': 0.1135, - 'sequence': 'HuggingFace is creating a framework that the community uses to ' - 'solve NLP tasks.', - 'token': 7208, - 'token_str': ' framework'}, - {'score': 0.0524, - 'sequence': 'HuggingFace is creating a library that the community uses to ' - 'solve NLP tasks.', - 'token': 5560, - 'token_str': ' library'}, - {'score': 0.0349, - 'sequence': 'HuggingFace is creating a database that the community uses to ' - 'solve NLP tasks.', - 'token': 8503, - 'token_str': ' database'}, - {'score': 0.0286, - 'sequence': 'HuggingFace is creating a prototype that the community uses to ' - 'solve NLP tasks.', - 'token': 17715, - 'token_str': ' prototype'}] -``` - -Here is an example of doing masked language modeling using a model and a tokenizer. The process is the following: - -1. Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a DistilBERT model and - loads it with the weights stored in the checkpoint. -2. Define a sequence with a masked token, placing the `tokenizer.mask_token` instead of a word. -3. Encode that sequence into a list of IDs and find the position of the masked token in that list. -4. Retrieve the predictions at the index of the mask token: this tensor has the same size as the vocabulary, and the - values are the scores attributed to each token. The model gives higher score to tokens it deems probable in that - context. -5. Retrieve the top 5 tokens using the PyTorch `topk` or TensorFlow `top_k` methods. -6. 
Replace the mask token by the tokens and print the results - - - -```py ->>> from transformers import AutoModelForMaskedLM, AutoTokenizer ->>> import torch - ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased") ->>> model = AutoModelForMaskedLM.from_pretrained("distilbert-base-cased") - ->>> sequence = ( -... "Distilled models are smaller than the models they mimic. Using them instead of the large " -... f"versions would help {tokenizer.mask_token} our carbon footprint." -... ) - ->>> inputs = tokenizer(sequence, return_tensors="pt") ->>> mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1] - ->>> token_logits = model(**inputs).logits ->>> mask_token_logits = token_logits[0, mask_token_index, :] - ->>> top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist() - ->>> for token in top_5_tokens: -... print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token]))) -Distilled models are smaller than the models they mimic. Using them instead of the large versions would help reduce our carbon footprint. -Distilled models are smaller than the models they mimic. Using them instead of the large versions would help increase our carbon footprint. -Distilled models are smaller than the models they mimic. Using them instead of the large versions would help decrease our carbon footprint. -Distilled models are smaller than the models they mimic. Using them instead of the large versions would help offset our carbon footprint. -Distilled models are smaller than the models they mimic. Using them instead of the large versions would help improve our carbon footprint. -``` - - -```py ->>> from transformers import TFAutoModelForMaskedLM, AutoTokenizer ->>> import tensorflow as tf - ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased") ->>> model = TFAutoModelForMaskedLM.from_pretrained("distilbert-base-cased") - ->>> sequence = ( -... "Distilled models are smaller than the models they mimic. Using them instead of the large " -... f"versions would help {tokenizer.mask_token} our carbon footprint." -... ) - ->>> inputs = tokenizer(sequence, return_tensors="tf") ->>> mask_token_index = tf.where(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1] - ->>> token_logits = model(**inputs).logits ->>> mask_token_logits = token_logits[0, mask_token_index, :] - ->>> top_5_tokens = tf.math.top_k(mask_token_logits, 5).indices.numpy() - ->>> for token in top_5_tokens: -... print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token]))) -Distilled models are smaller than the models they mimic. Using them instead of the large versions would help reduce our carbon footprint. -Distilled models are smaller than the models they mimic. Using them instead of the large versions would help increase our carbon footprint. -Distilled models are smaller than the models they mimic. Using them instead of the large versions would help decrease our carbon footprint. -Distilled models are smaller than the models they mimic. Using them instead of the large versions would help offset our carbon footprint. -Distilled models are smaller than the models they mimic. Using them instead of the large versions would help improve our carbon footprint. -``` - - - -This prints five sequences, with the top 5 tokens predicted by the model. - - -### Causal Language Modeling - -Causal language modeling is the task of predicting the token following a sequence of tokens. 
In this situation, the -model only attends to the left context (tokens on the left of the mask). Such a training is particularly interesting -for generation tasks. If you would like to fine-tune a model on a causal language modeling task, you may leverage the -[run_clm.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling/run_clm.py) script. - -Usually, the next token is predicted by sampling from the logits of the last hidden state the model produces from the -input sequence. - - - -Here is an example of using the tokenizer and model and leveraging the -[`top_k_top_p_filtering`] method to sample the next token following an input sequence -of tokens. - -```py ->>> from transformers import AutoModelForCausalLM, AutoTokenizer, top_k_top_p_filtering ->>> import torch ->>> from torch import nn - ->>> tokenizer = AutoTokenizer.from_pretrained("gpt2") ->>> model = AutoModelForCausalLM.from_pretrained("gpt2") - ->>> sequence = f"Hugging Face is based in DUMBO, New York City, and" - ->>> inputs = tokenizer(sequence, return_tensors="pt") ->>> input_ids = inputs["input_ids"] - ->>> # get logits of last hidden state ->>> next_token_logits = model(**inputs).logits[:, -1, :] - ->>> # filter ->>> filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0) - ->>> # sample ->>> probs = nn.functional.softmax(filtered_next_token_logits, dim=-1) ->>> next_token = torch.multinomial(probs, num_samples=1) - ->>> generated = torch.cat([input_ids, next_token], dim=-1) - ->>> resulting_string = tokenizer.decode(generated.tolist()[0]) ->>> print(resulting_string) -Hugging Face is based in DUMBO, New York City, and ... -``` - - -Here is an example of using the tokenizer and model and leveraging the -[`tf_top_k_top_p_filtering`] method to sample the next token following an input sequence -of tokens. - -```py ->>> from transformers import TFAutoModelForCausalLM, AutoTokenizer, tf_top_k_top_p_filtering ->>> import tensorflow as tf - ->>> tokenizer = AutoTokenizer.from_pretrained("gpt2") ->>> model = TFAutoModelForCausalLM.from_pretrained("gpt2") - ->>> sequence = f"Hugging Face is based in DUMBO, New York City, and" - ->>> inputs = tokenizer(sequence, return_tensors="tf") ->>> input_ids = inputs["input_ids"] - ->>> # get logits of last hidden state ->>> next_token_logits = model(**inputs).logits[:, -1, :] - ->>> # filter ->>> filtered_next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0) - ->>> # sample ->>> next_token = tf.random.categorical(filtered_next_token_logits, dtype=tf.int32, num_samples=1) - ->>> generated = tf.concat([input_ids, next_token], axis=1) - ->>> resulting_string = tokenizer.decode(generated.numpy().tolist()[0]) ->>> print(resulting_string) -Hugging Face is based in DUMBO, New York City, and ... -``` - - - -This outputs a (hopefully) coherent next token following the original sequence, which in our case is the word *is* or -*features*. - -In the next section, we show how [`generation.GenerationMixin.generate`] can be used to -generate multiple tokens up to a specified length instead of one token at a time. - -### Text Generation - -In text generation (*a.k.a* *open-ended text generation*) the goal is to create a coherent portion of text that is a -continuation from the given context. The following example shows how *GPT-2* can be used in pipelines to generate text. 
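-
-Behind the pipeline, this boils down to a call to [`PreTrainedModel.generate`]; here is a minimal sketch of calling it directly with *GPT-2* to produce a full continuation in one go (the prompt and sampling settings are only illustrative), with the pipeline version following right after:
-
-```py
->>> from transformers import AutoModelForCausalLM, AutoTokenizer
-
->>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
->>> model = AutoModelForCausalLM.from_pretrained("gpt2")
-
->>> inputs = tokenizer("Hugging Face is based in DUMBO, New York City, and", return_tensors="pt")
-
->>> # generate up to 30 tokens in one call instead of sampling a single next token
->>> outputs = model.generate(**inputs, max_length=30, do_sample=True, top_k=50)
->>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
-Hugging Face is based in DUMBO, New York City, and ...
-```
-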
-As a default all models apply *Top-K* sampling when used in pipelines, as configured in their respective configurations -(see [gpt-2 config](https://huggingface.co/gpt2/blob/main/config.json) for example). - - - -```py ->>> from transformers import pipeline - ->>> text_generator = pipeline("text-generation") ->>> print(text_generator("As far as I am concerned, I will", max_length=50, do_sample=False)) -[{'generated_text': 'As far as I am concerned, I will be the first to admit that I am not a fan of the idea of a -"free market." I think that the idea of a free market is a bit of a stretch. I think that the idea'}] -``` - -Here, the model generates a random text with a total maximal length of *50* tokens from context *"As far as I am -concerned, I will"*. Behind the scenes, the pipeline object calls the method -[`PreTrainedModel.generate`] to generate text. The default arguments for this method can be -overridden in the pipeline, as is shown above for the arguments `max_length` and `do_sample`. - -Below is an example of text generation using `XLNet` and its tokenizer, which includes calling `generate()` directly: - -```py ->>> from transformers import AutoModelForCausalLM, AutoTokenizer - ->>> model = AutoModelForCausalLM.from_pretrained("xlnet-base-cased") ->>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased") - ->>> # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology ->>> PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family -... (except for Alexei and Maria) are discovered. -... The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the -... remainder of the story. 1883 Western Siberia, -... a young Grigori Rasputin is asked by his father and a group of men to perform magic. -... Rasputin has a vision and denounces one of the men as a horse thief. Although his -... father initially slaps him for making such an accusation, Rasputin watches as the -... man is chased outside and beaten. Twenty years later, Rasputin sees a vision of -... the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, -... with people, even a bishop, begging for his blessing. """ - ->>> prompt = "Today the weather is really nice and I am planning on " ->>> inputs = tokenizer(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")["input_ids"] - ->>> prompt_length = len(tokenizer.decode(inputs[0])) ->>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60) ->>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length + 1 :] - ->>> print(generated) -Today the weather is really nice and I am planning ... -``` - - -```py ->>> from transformers import TFAutoModelForCausalLM, AutoTokenizer - ->>> model = TFAutoModelForCausalLM.from_pretrained("xlnet-base-cased") ->>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased") - ->>> # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology ->>> PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family -... (except for Alexei and Maria) are discovered. -... The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the -... remainder of the story. 1883 Western Siberia, -... a young Grigori Rasputin is asked by his father and a group of men to perform magic. -... Rasputin has a vision and denounces one of the men as a horse thief. Although his -... 
father initially slaps him for making such an accusation, Rasputin watches as the -... man is chased outside and beaten. Twenty years later, Rasputin sees a vision of -... the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, -... with people, even a bishop, begging for his blessing. """ - ->>> prompt = "Today the weather is really nice and I am planning on " ->>> inputs = tokenizer(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="tf")["input_ids"] - ->>> prompt_length = len(tokenizer.decode(inputs[0])) ->>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60) ->>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length + 1 :] - ->>> print(generated) -Today the weather is really nice and I am planning ... -``` - - - -Text generation is currently possible with *GPT-2*, *OpenAi-GPT*, *CTRL*, *XLNet*, *Transfo-XL* and *Reformer* in -PyTorch and for most models in Tensorflow as well. As can be seen in the example above *XLNet* and *Transfo-XL* often -need to be padded to work well. GPT-2 is usually a good choice for *open-ended text generation* because it was trained -on millions of webpages with a causal language modeling objective. - -For more information on how to apply different decoding strategies for text generation, please also refer to our text -generation blog post [here](https://huggingface.co/blog/how-to-generate). - - -## Named Entity Recognition - -Named Entity Recognition (NER) is the task of classifying tokens according to a class, for example, identifying a token -as a person, an organisation or a location. An example of a named entity recognition dataset is the CoNLL-2003 dataset, -which is entirely based on that task. If you would like to fine-tune a model on an NER task, you may leverage the -[run_ner.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification/run_ner.py) script. - -Here is an example of using pipelines to do named entity recognition, specifically, trying to identify tokens as -belonging to one of 9 classes: - -- O, Outside of a named entity -- B-MIS, Beginning of a miscellaneous entity right after another miscellaneous entity -- I-MIS, Miscellaneous entity -- B-PER, Beginning of a person's name right after another person's name -- I-PER, Person's name -- B-ORG, Beginning of an organisation right after another organisation -- I-ORG, Organisation -- B-LOC, Beginning of a location right after another location -- I-LOC, Location - -It leverages a fine-tuned model on CoNLL-2003, fine-tuned by [@stefan-it](https://github.com/stefan-it) from [dbmdz](https://github.com/dbmdz). - -```py ->>> from transformers import pipeline - ->>> ner_pipe = pipeline("ner") - ->>> sequence = """Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, -... therefore very close to the Manhattan Bridge which is visible from the window.""" -``` - -This outputs a list of all words that have been identified as one of the entities from the 9 classes defined above. -Here are the expected results: - -```py ->>> for entity in ner_pipe(sequence): -... 
print(entity) -{'entity': 'I-ORG', 'score': 0.9996, 'index': 1, 'word': 'Hu', 'start': 0, 'end': 2} -{'entity': 'I-ORG', 'score': 0.9910, 'index': 2, 'word': '##gging', 'start': 2, 'end': 7} -{'entity': 'I-ORG', 'score': 0.9982, 'index': 3, 'word': 'Face', 'start': 8, 'end': 12} -{'entity': 'I-ORG', 'score': 0.9995, 'index': 4, 'word': 'Inc', 'start': 13, 'end': 16} -{'entity': 'I-LOC', 'score': 0.9994, 'index': 11, 'word': 'New', 'start': 40, 'end': 43} -{'entity': 'I-LOC', 'score': 0.9993, 'index': 12, 'word': 'York', 'start': 44, 'end': 48} -{'entity': 'I-LOC', 'score': 0.9994, 'index': 13, 'word': 'City', 'start': 49, 'end': 53} -{'entity': 'I-LOC', 'score': 0.9863, 'index': 19, 'word': 'D', 'start': 79, 'end': 80} -{'entity': 'I-LOC', 'score': 0.9514, 'index': 20, 'word': '##UM', 'start': 80, 'end': 82} -{'entity': 'I-LOC', 'score': 0.9337, 'index': 21, 'word': '##BO', 'start': 82, 'end': 84} -{'entity': 'I-LOC', 'score': 0.9762, 'index': 28, 'word': 'Manhattan', 'start': 114, 'end': 123} -{'entity': 'I-LOC', 'score': 0.9915, 'index': 29, 'word': 'Bridge', 'start': 124, 'end': 130} -``` - -Note how the tokens of the sequence "Hugging Face" have been identified as an organisation, and "New York City", -"DUMBO" and "Manhattan Bridge" have been identified as locations. - -Here is an example of doing named entity recognition, using a model and a tokenizer. The process is the following: - -1. Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and loads it - with the weights stored in the checkpoint. -2. Define a sequence with known entities, such as "Hugging Face" as an organisation and "New York City" as a location. -3. Split words into tokens so that they can be mapped to predictions. We use a small hack by, first, completely - encoding and decoding the sequence, so that we're left with a string that contains the special tokens. -4. Encode that sequence into IDs (special tokens are added automatically). -5. Retrieve the predictions by passing the input to the model and getting the first output. This results in a - distribution over the 9 possible classes for each token. We take the argmax to retrieve the most likely class for - each token. -6. Zip together each token with its prediction and print it. - - - -```py ->>> from transformers import AutoModelForTokenClassification, AutoTokenizer ->>> import torch - ->>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english") ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") - ->>> sequence = ( -... "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, " -... "therefore very close to the Manhattan Bridge." -... ) - ->>> inputs = tokenizer(sequence, return_tensors="pt") ->>> tokens = inputs.tokens() - ->>> outputs = model(**inputs).logits ->>> predictions = torch.argmax(outputs, dim=2) -``` - - -```py ->>> from transformers import TFAutoModelForTokenClassification, AutoTokenizer ->>> import tensorflow as tf - ->>> model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english") ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") - ->>> sequence = ( -... "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, " -... "therefore very close to the Manhattan Bridge." -... 
) - ->>> inputs = tokenizer(sequence, return_tensors="tf") ->>> tokens = inputs.tokens() - ->>> outputs = model(**inputs)[0] ->>> predictions = tf.argmax(outputs, axis=2) -``` - - - -This outputs a list of each token mapped to its corresponding prediction. Differently from the pipeline, here every -token has a prediction as we didn't remove the "0"th class, which means that no particular entity was found on that -token. - -In the above example, `predictions` is an integer that corresponds to the predicted class. We can use the -`model.config.id2label` property in order to recover the class name corresponding to the class number, which is -illustrated below: - -```py ->>> for token, prediction in zip(tokens, predictions[0].numpy()): -... print((token, model.config.id2label[prediction])) -('[CLS]', 'O') -('Hu', 'I-ORG') -('##gging', 'I-ORG') -('Face', 'I-ORG') -('Inc', 'I-ORG') -('.', 'O') -('is', 'O') -('a', 'O') -('company', 'O') -('based', 'O') -('in', 'O') -('New', 'I-LOC') -('York', 'I-LOC') -('City', 'I-LOC') -('.', 'O') -('Its', 'O') -('headquarters', 'O') -('are', 'O') -('in', 'O') -('D', 'I-LOC') -('##UM', 'I-LOC') -('##BO', 'I-LOC') -(',', 'O') -('therefore', 'O') -('very', 'O') -('close', 'O') -('to', 'O') -('the', 'O') -('Manhattan', 'I-LOC') -('Bridge', 'I-LOC') -('.', 'O') -('[SEP]', 'O') -``` - -## Summarization - -Summarization is the task of summarizing a document or an article into a shorter text. If you would like to fine-tune a -model on a summarization task, you may leverage the [run_summarization.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization/run_summarization.py) -script. - -An example of a summarization dataset is the CNN / Daily Mail dataset, which consists of long news articles and was -created for the task of summarization. If you would like to fine-tune a model on a summarization task, various -approaches are described in this [document](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization/README.md). - -Here is an example of using the pipelines to do summarization. It leverages a Bart model that was fine-tuned on the CNN -/ Daily Mail data set. - -```py ->>> from transformers import pipeline - ->>> summarizer = pipeline("summarization") - ->>> ARTICLE = """ New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. -... A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. -... Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. -... In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. -... Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the -... 2010 marriage license application, according to court documents. -... Prosecutors said the marriages were part of an immigration scam. -... On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. -... After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective -... Annette Markowski, a police spokeswoman. 
In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. -... All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. -... Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. -... Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. -... The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s -... Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. -... Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. -... If convicted, Barrientos faces up to four years in prison. Her next court appearance is scheduled for May 18. -... """ -``` - -Because the summarization pipeline depends on the `PreTrainedModel.generate()` method, we can override the default -arguments of `PreTrainedModel.generate()` directly in the pipeline for `max_length` and `min_length` as shown -below. This outputs the following summary: - -```py ->>> print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False)) -[{'summary_text': ' Liana Barrientos, 39, is charged with two counts of "offering a false instrument for filing in -the first degree" In total, she has been married 10 times, with nine of her marriages occurring between 1999 and -2002 . At one time, she was married to eight men at once, prosecutors say .'}] -``` - -Here is an example of doing summarization using a model and a tokenizer. The process is the following: - -1. Instantiate a tokenizer and a model from the checkpoint name. Summarization is usually done using an encoder-decoder - model, such as `Bart` or `T5`. -2. Define the article that should be summarized. -3. Add the T5 specific prefix "summarize: ". -4. Use the `PreTrainedModel.generate()` method to generate the summary. - -In this example we use Google's T5 model. Even though it was pre-trained only on a multi-task mixed dataset (including -CNN / Daily Mail), it yields very good results. - - - -```py ->>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer - ->>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") ->>> tokenizer = AutoTokenizer.from_pretrained("t5-base") - ->>> # T5 uses a max_length of 512 so we cut the article to 512 tokens. ->>> inputs = tokenizer("summarize: " + ARTICLE, return_tensors="pt", max_length=512, truncation=True) ->>> outputs = model.generate( -... inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True -... ) - ->>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) -prosecutors say the marriages were part of an immigration scam. if convicted, barrientos faces two criminal -counts of "offering a false instrument for filing in the first degree" she has been married 10 times, nine of them -between 1999 and 2002. -``` - - -```py ->>> from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer - ->>> model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-base") ->>> tokenizer = AutoTokenizer.from_pretrained("t5-base") - ->>> # T5 uses a max_length of 512 so we cut the article to 512 tokens. 
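->>> # Unlike the PyTorch version above, this call doesn't pass truncation=True; add it so articles longer than 512 tokens are actually truncated.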
->>> inputs = tokenizer("summarize: " + ARTICLE, return_tensors="tf", max_length=512) ->>> outputs = model.generate( -... inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True -... ) - ->>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) -prosecutors say the marriages were part of an immigration scam. if convicted, barrientos faces two criminal -counts of "offering a false instrument for filing in the first degree" she has been married 10 times, nine of them -between 1999 and 2002. -``` - - - -## Translation - -Translation is the task of translating a text from one language to another. If you would like to fine-tune a model on a -translation task, you may leverage the [run_translation.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/translation/run_translation.py) script. - -An example of a translation dataset is the WMT English to German dataset, which has sentences in English as the input -data and the corresponding sentences in German as the target data. If you would like to fine-tune a model on a -translation task, various approaches are described in this [document](https://github.com/huggingface/transformers/tree/main/examples/pytorch/translation/README.md). - -Here is an example of using the pipelines to do translation. It leverages a T5 model that was only pre-trained on a -multi-task mixture dataset (including WMT), yet, yielding impressive translation results. - -```py ->>> from transformers import pipeline - ->>> translator = pipeline("translation_en_to_de") ->>> print(translator("Hugging Face is a technology company based in New York and Paris", max_length=40)) -[{'translation_text': 'Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris.'}] -``` - -Because the translation pipeline depends on the `PreTrainedModel.generate()` method, we can override the default -arguments of `PreTrainedModel.generate()` directly in the pipeline as is shown for `max_length` above. - -Here is an example of doing translation using a model and a tokenizer. The process is the following: - -1. Instantiate a tokenizer and a model from the checkpoint name. Summarization is usually done using an encoder-decoder - model, such as `Bart` or `T5`. -2. Define the article that should be summarized. -3. Add the T5 specific prefix "translate English to German: " -4. Use the `PreTrainedModel.generate()` method to perform the translation. - - - -```py ->>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer - ->>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") ->>> tokenizer = AutoTokenizer.from_pretrained("t5-base") - ->>> inputs = tokenizer( -... "translate English to German: Hugging Face is a technology company based in New York and Paris", -... return_tensors="pt", -... ) ->>> outputs = model.generate(inputs["input_ids"], max_length=40, num_beams=4, early_stopping=True) - ->>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) -Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris. -``` - - -```py ->>> from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer - ->>> model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-base") ->>> tokenizer = AutoTokenizer.from_pretrained("t5-base") - ->>> inputs = tokenizer( -... "translate English to German: Hugging Face is a technology company based in New York and Paris", -... return_tensors="tf", -... 
) ->>> outputs = model.generate(inputs["input_ids"], max_length=40, num_beams=4, early_stopping=True) - ->>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) -Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris. -``` - - - -We get the same translation as with the pipeline example. - -## Audio classification - -Audio classification assigns a class to an audio signal. The Keyword Spotting dataset from the [SUPERB](https://huggingface.co/datasets/superb) benchmark is an example dataset that can be used for audio classification fine-tuning. This dataset contains ten classes of keywords for classification. If you'd like to fine-tune a model for audio classification, take a look at the [run_audio_classification.py](https://github.com/huggingface/transformers/blob/main/examples/pytorch/audio-classification/run_audio_classification.py) script or this [how-to guide](./tasks/audio_classification). - -The following examples demonstrate how to use a [`pipeline`] and a model and tokenizer for audio classification inference: - -```py ->>> from transformers import pipeline ->>> from datasets import load_dataset ->>> import torch - ->>> torch.manual_seed(42) # doctest: +IGNORE_RESULT - ->>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") ->>> dataset = dataset.sort("id") ->>> audio_file = dataset[0]["audio"]["path"] - ->>> audio_classifier = pipeline( -... task="audio-classification", model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" -... ) ->>> predictions = audio_classifier(audio_file) ->>> predictions = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in predictions] ->>> predictions -[{'score': 0.1315, 'label': 'calm'}, {'score': 0.1307, 'label': 'neutral'}, {'score': 0.1274, 'label': 'sad'}, {'score': 0.1261, 'label': 'fearful'}, {'score': 0.1242, 'label': 'happy'}] -``` - -The general process for using a model and feature extractor for audio classification is: - -1. Instantiate a feature extractor and a model from the checkpoint name. -2. Process the audio signal to be classified with a feature extractor. -3. Pass the input through the model and take the `argmax` to retrieve the most likely class. -4. Convert the class id to a class name with `id2label` to return an interpretable result. - - - -```py ->>> from transformers import AutoFeatureExtractor, AutoModelForAudioClassification ->>> from datasets import load_dataset ->>> import torch - ->>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") ->>> dataset = dataset.sort("id") ->>> sampling_rate = dataset.features["audio"].sampling_rate - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ks") ->>> model = AutoModelForAudioClassification.from_pretrained("superb/wav2vec2-base-superb-ks") - ->>> inputs = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") - ->>> with torch.no_grad(): -... logits = model(**inputs).logits - ->>> predicted_class_ids = torch.argmax(logits, dim=-1).item() ->>> predicted_label = model.config.id2label[predicted_class_ids] ->>> predicted_label -'_unknown_' -``` - - - -## Automatic speech recognition - -Automatic speech recognition transcribes an audio signal to text. The [Common Voice](https://huggingface.co/datasets/common_voice) dataset is an example dataset that can be used for automatic speech recognition fine-tuning. 
It contains an audio file of a speaker and the corresponding sentence. If you'd like to fine-tune a model for automatic speech recognition, take a look at the [run_speech_recognition_ctc.py](https://github.com/huggingface/transformers/blob/main/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py) or [run_speech_recognition_seq2seq.py](https://github.com/huggingface/transformers/blob/main/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py) scripts or this [how-to guide](./tasks/asr). - -The following examples demonstrate how to use a [`pipeline`] and a model and tokenizer for automatic speech recognition inference: - -```py ->>> from transformers import pipeline ->>> from datasets import load_dataset - ->>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") ->>> dataset = dataset.sort("id") ->>> audio_file = dataset[0]["audio"]["path"] - ->>> speech_recognizer = pipeline(task="automatic-speech-recognition", model="facebook/wav2vec2-base-960h") ->>> speech_recognizer(audio_file) -{'text': 'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'} -``` - -The general process for using a model and processor for automatic speech recognition is: - -1. Instantiate a processor (which regroups a feature extractor for input processing and a tokenizer for decoding) and a model from the checkpoint name. -2. Process the audio signal and text with a processor. -3. Pass the input through the model and take the `argmax` to retrieve the predicted text. -4. Decode the text with a tokenizer to obtain the transcription. - - - -```py ->>> from transformers import AutoProcessor, AutoModelForCTC ->>> from datasets import load_dataset ->>> import torch - ->>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") ->>> dataset = dataset.sort("id") ->>> sampling_rate = dataset.features["audio"].sampling_rate - ->>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") ->>> model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h") - ->>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") ->>> with torch.no_grad(): -... logits = model(**inputs).logits ->>> predicted_ids = torch.argmax(logits, dim=-1) - ->>> transcription = processor.batch_decode(predicted_ids) ->>> transcription[0] -'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL' -``` - - - -## Image classification - -Like text and audio classification, image classification assigns a class to an image. The [CIFAR-100](https://huggingface.co/datasets/cifar100) dataset is an example dataset that can be used for image classification fine-tuning. It contains an image and the corresponding class. If you'd like to fine-tune a model for image classification, take a look at the [run_image_classification.py](https://github.com/huggingface/transformers/blob/main/examples/pytorch/image-classification/run_image_classification.py) script or this [how-to guide](./tasks/image_classification). - -The following examples demonstrate how to use a [`pipeline`] and a model and tokenizer for image classification inference: - -```py ->>> from transformers import pipeline - ->>> vision_classifier = pipeline(task="image-classification") ->>> result = vision_classifier( -... images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" -... 
) ->>> print("\n".join([f"Class {d['label']} with score {round(d['score'], 4)}" for d in result])) -Class lynx, catamount with score 0.4335 -Class cougar, puma, catamount, mountain lion, painter, panther, Felis concolor with score 0.0348 -Class snow leopard, ounce, Panthera uncia with score 0.0324 -Class Egyptian cat with score 0.0239 -Class tiger cat with score 0.0229 -``` - -The general process for using a model and image processor for image classification is: - -1. Instantiate an image processor and a model from the checkpoint name. -2. Process the image to be classified with an image processor. -3. Pass the input through the model and take the `argmax` to retrieve the predicted class. -4. Convert the class id to a class name with `id2label` to return an interpretable result. - - - -```py ->>> from transformers import AutoImageProcessor, AutoModelForImageClassification ->>> import torch ->>> from datasets import load_dataset - ->>> dataset = load_dataset("huggingface/cats-image") ->>> image = dataset["test"]["image"][0] - ->>> feature_extractor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") ->>> model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224") - ->>> inputs = feature_extractor(image, return_tensors="pt") - ->>> with torch.no_grad(): -... logits = model(**inputs).logits - ->>> predicted_label = logits.argmax(-1).item() ->>> print(model.config.id2label[predicted_label]) -Egyptian cat -``` - - diff --git a/docs/source/en/tasks/asr.md b/docs/source/en/tasks/asr.md new file mode 100644 index 000000000000..d01269ba60a6 --- /dev/null +++ b/docs/source/en/tasks/asr.md @@ -0,0 +1,376 @@ + + +# Automatic speech recognition + +[[open-in-colab]] + + + +Automatic speech recognition (ASR) converts a speech signal to text, mapping a sequence of audio inputs to text outputs. Virtual assistants like Siri and Alexa use ASR models to help users everyday, and there are many other useful user-facing applications like live captioning and note-taking during meetings. + +This guide will show you how to: + +1. Finetune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset to transcribe audio to text. +2. Use your finetuned model for inference. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[Data2VecAudio](../model_doc/data2vec-audio), [Hubert](../model_doc/hubert), [M-CTC-T](../model_doc/mctct), [SEW](../model_doc/sew), [SEW-D](../model_doc/sew-d), [UniSpeech](../model_doc/unispeech), [UniSpeechSat](../model_doc/unispeech-sat), [Wav2Vec2](../model_doc/wav2vec2), [Wav2Vec2-Conformer](../model_doc/wav2vec2-conformer), [WavLM](../model_doc/wavlm) + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate jiwer +``` + +We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load MInDS-14 dataset + +Start by loading a smaller subset of the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset from the 🤗 Datasets library. This'll give you a chance to experiment and make sure everything works before spending more time training on the full dataset. 
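+
+The `split="train[:100]"` argument in the code below uses 🤗 Datasets' split-slicing syntax to keep just the first 100 examples. Once everything runs end to end, you could, for example, drop the slice to load the full training split (shown here only as a sketch):
+
+```py
+>>> from datasets import load_dataset
+
+>>> # hypothetical follow-up: load the complete training split instead of the first 100 examples
+>>> minds_full = load_dataset("PolyAI/minds14", name="en-US", split="train")
+```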
+ +```py +>>> from datasets import load_dataset, Audio + +>>> minds = load_dataset("PolyAI/minds14", name="en-US", split="train[:100]") +``` + +Split the dataset's `train` split into a train and test set with the [`~Dataset.train_test_split`] method: + +```py +>>> minds = minds.train_test_split(test_size=0.2) +``` + +Then take a look at the dataset: + +```py +>>> minds +DatasetDict({ + train: Dataset({ + features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], + num_rows: 16 + }) + test: Dataset({ + features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], + num_rows: 4 + }) +}) +``` + +While the dataset contains a lot of useful information, like `lang_id` and `english_transcription`, you'll focus on the `audio` and `transcription` in this guide. Remove the other columns with the [`~datasets.Dataset.remove_columns`] method: + +```py +>>> minds = minds.remove_columns(["english_transcription", "intent_class", "lang_id"]) +``` + +Take a look at the example again: + +```py +>>> minds["train"][0] +{'audio': {'array': array([-0.00024414, 0. , 0. , ..., 0.00024414, + 0.00024414, 0.00024414], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', + 'sampling_rate': 8000}, + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', + 'transcription': "hi I'm trying to use the banking app on my phone and currently my checking and savings account balance is not refreshing"} +``` + +There are two fields: + +- `audio`: a 1-dimensional `array` of the speech signal that must be called to load and resample the audio file. +- `transcription`: the target text. + +## Preprocess + +The next step is to load a Wav2Vec2 processor to process the audio signal: + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base") +``` + +The MInDS-14 dataset has a sampling rate of 8000kHz (you can find this information in its [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16000kHz to use the pretrained Wav2Vec2 model: + +```py +>>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000)) +>>> minds["train"][0] +{'audio': {'array': array([-2.38064706e-04, -1.58618059e-04, -5.43987835e-06, ..., + 2.78103951e-04, 2.38446111e-04, 1.18740834e-04], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', + 'sampling_rate': 16000}, + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', + 'transcription': "hi I'm trying to use the banking app on my phone and currently my checking and savings account balance is not refreshing"} +``` + +As you can see in the `transcription` above, the text contains a mix of upper and lowercase characters. The Wav2Vec2 tokenizer is only trained on uppercase characters so you'll need to make sure the text matches the tokenizer's vocabulary: + +```py +>>> def uppercase(example): +... 
return {"transcription": example["transcription"].upper()} + + +>>> minds = minds.map(uppercase) +``` + +Now create a preprocessing function that: + +1. Calls the `audio` column to load and resample the audio file. +2. Extracts the `input_values` from the audio file and tokenize the `transcription` column with the processor. + +```py +>>> def prepare_dataset(batch): +... audio = batch["audio"] +... batch = processor(audio["array"], sampling_rate=audio["sampling_rate"], text=batch["transcription"]) +... batch["input_length"] = len(batch["input_values"][0]) +... return batch +``` + +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up `map` by increasing the number of processes with the `num_proc` parameter. Remove the columns you don't need with the [`~datasets.Dataset.remove_columns`] method: + +```py +>>> encoded_minds = minds.map(prepare_dataset, remove_columns=minds.column_names["train"], num_proc=4) +``` + +🤗 Transformers doesn't have a data collator for ASR, so you'll need to adapt the [`DataCollatorWithPadding`] to create a batch of examples. It'll also dynamically pad your text and labels to the length of the longest element in its batch (instead of the entire dataset) so they are a uniform length. While it is possible to pad your text in the `tokenizer` function by setting `padding=True`, dynamic padding is more efficient. + +Unlike other data collators, this specific data collator needs to apply a different padding method to `input_values` and `labels`: + +```py +>>> import torch + +>>> from dataclasses import dataclass, field +>>> from typing import Any, Dict, List, Optional, Union + + +>>> @dataclass +... class DataCollatorCTCWithPadding: +... processor: AutoProcessor +... padding: Union[bool, str] = "longest" + +... def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: +... # split inputs and labels since they have to be of different lengths and need +... # different padding methods +... input_features = [{"input_values": feature["input_values"][0]} for feature in features] +... label_features = [{"input_ids": feature["labels"]} for feature in features] + +... batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt") + +... labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt") + +... # replace padding with -100 to ignore loss correctly +... labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) + +... batch["labels"] = labels + +... return batch +``` + +Now instantiate your `DataCollatorForCTCWithPadding`: + +```py +>>> data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest") +``` + +## Evaluate + +Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [word error rate](https://huggingface.co/spaces/evaluate-metric/wer) (WER) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): + +```py +>>> import evaluate + +>>> wer = evaluate.load("wer") +``` + +Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the WER: + +```py +>>> import numpy as np + + +>>> def compute_metrics(pred): +... 
pred_logits = pred.predictions +... pred_ids = np.argmax(pred_logits, axis=-1) + +... pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id + +... pred_str = processor.batch_decode(pred_ids) +... label_str = processor.batch_decode(pred.label_ids, group_tokens=False) + +... wer = wer.compute(predictions=pred_str, references=label_str) + +... return {"wer": wer} +``` + +Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. + +## Train + + + + + +If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! + + + +You're ready to start training your model now! Load Wav2Vec2 with [`AutoModelForCTC`]. Specify the reduction to apply with the `ctc_loss_reduction` parameter. It is often better to use the average instead of the default summation: + +```py +>>> from transformers import AutoModelForCTC, TrainingArguments, Trainer + +>>> model = AutoModelForCTC.from_pretrained( +... "facebook/wav2vec2-base", +... ctc_loss_reduction="mean", +... pad_token_id=processor.tokenizer.pad_token_id, +... ) +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the WER and save the training checkpoint. +2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_asr_mind_model", +... per_device_train_batch_size=8, +... gradient_accumulation_steps=2, +... learning_rate=1e-5, +... warmup_steps=500, +... max_steps=2000, +... gradient_checkpointing=True, +... fp16=True, +... group_by_length=True, +... evaluation_strategy="steps", +... per_device_eval_batch_size=8, +... save_steps=1000, +... eval_steps=1000, +... logging_steps=25, +... load_best_model_at_end=True, +... metric_for_best_model="wer", +... greater_is_better=False, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=encoded_minds["train"], +... eval_dataset=encoded_minds["test"], +... tokenizer=processor, +... data_collator=data_collator, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + + +For a more in-depth example of how to finetune a model for automatic speech recognition, take a look at this blog [post](https://huggingface.co/blog/fine-tune-wav2vec2-english) for English ASR and this [post](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) for multilingual ASR. + + + +## Inference + +Great, now that you've finetuned a model, you can use it for inference! + +Load an audio file you'd like to run inference on. Remember to resample the sampling rate of the audio file to match the sampling rate of the model if you need to! 
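+
+If you're unsure which rate the model expects, you can read it off the feature extractor; a minimal sketch using the processor checkpoint from this guide:
+
+```py
+>>> from transformers import AutoProcessor
+
+>>> processor = AutoProcessor.from_pretrained("stevhliu/my_awesome_asr_mind_model")
+>>> processor.feature_extractor.sampling_rate
+16000
+```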
+ +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("PolyAI/minds14", "en-US", split="train") +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) +>>> sampling_rate = dataset.features["audio"].sampling_rate +>>> audio_file = dataset[0]["audio"]["path"] +``` + +The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for automatic speech recognition with your model, and pass your audio file to it: + +```py +>>> from transformers import pipeline + +>>> transcriber = pipeline("automatic-speech-recognition", model="stevhliu/my_awesome_asr_minds_model") +>>> transcriber(audio_file) +{'text': 'I WOUD LIKE O SET UP JOINT ACOUNT WTH Y PARTNER'} +``` + + + +The transcription is decent, but it could be better! Try finetuning your model on more examples to get even better results! + + + +You can also manually replicate the results of the `pipeline` if you'd like: + + + +Load a processor to preprocess the audio file and transcription and return the `input` as PyTorch tensors: + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained("stevhliu/my_awesome_asr_mind_model") +>>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") +``` + +Pass your inputs to the model and return the logits: + +```py +>>> from transformers import AutoModelForCTC + +>>> model = AutoModelForCTC.from_pretrained("stevhliu/my_awesome_asr_mind_model") +>>> with torch.no_grad(): +... logits = model(**inputs).logits +``` + +Get the predicted `input_ids` with the highest probability, and use the processor to decode the predicted `input_ids` back into text: + +```py +>>> import torch + +>>> predicted_ids = torch.argmax(logits, dim=-1) +>>> transcription = processor.batch_decode(predicted_ids) +>>> transcription +['I WOUL LIKE O SET UP JOINT ACOUNT WTH Y PARTNER'] +``` + + \ No newline at end of file diff --git a/docs/source/en/tasks/asr.mdx b/docs/source/en/tasks/asr.mdx deleted file mode 100644 index fcd5bc508c87..000000000000 --- a/docs/source/en/tasks/asr.mdx +++ /dev/null @@ -1,366 +0,0 @@ - - -# Automatic speech recognition - - - -Automatic speech recognition (ASR) converts a speech signal to text, mapping a sequence of audio inputs to text outputs. Virtual assistants like Siri and Alexa use ASR models to help users everyday, and there are many other useful user-facing applications like live captioning and note-taking during meetings. - -This guide will show you how to: - -1. Finetune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset to transcribe audio to text. -2. Use your finetuned model for inference. - - - -See the automatic speech recognition [task page](https://huggingface.co/tasks/automatic-speech-recognition) for more information about its associated models, datasets, and metrics. - - - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install transformers datasets evaluate jiwer -``` - -We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load MInDS-14 dataset - -Start by loading a smaller subset of the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset from the 🤗 Datasets library. 
This'll give you a chance to experiment and make sure everything works before spending more time training on the full dataset. - -```py ->>> from datasets import load_dataset, Audio - ->>> minds = load_dataset("PolyAI/minds14", name="en-US", split="train[:100]") -``` - -Split the dataset's `train` split into a train and test set with the [`~Dataset.train_test_split`] method: - -```py ->>> minds = minds.train_test_split(test_size=0.2) -``` - -Then take a look at the dataset: - -```py ->>> minds -DatasetDict({ - train: Dataset({ - features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], - num_rows: 16 - }) - test: Dataset({ - features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], - num_rows: 4 - }) -}) -``` - -While the dataset contains a lot of useful information, like `lang_id` and `english_transcription`, you'll focus on the `audio` and `transcription` in this guide. Remove the other columns with the [`~datasets.Dataset.remove_columns`] method: - -```py ->>> minds = minds.remove_columns(["english_transcription", "intent_class", "lang_id"]) -``` - -Take a look at the example again: - -```py ->>> minds["train"][0] -{'audio': {'array': array([-0.00024414, 0. , 0. , ..., 0.00024414, - 0.00024414, 0.00024414], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', - 'sampling_rate': 8000}, - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', - 'transcription': "hi I'm trying to use the banking app on my phone and currently my checking and savings account balance is not refreshing"} -``` - -There are two fields: - -- `audio`: a 1-dimensional `array` of the speech signal that must be called to load and resample the audio file. -- `transcription`: the target text. - -## Preprocess - -The next step is to load a Wav2Vec2 processor to process the audio signal: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base") -``` - -The MInDS-14 dataset has a sampling rate of 8000kHz (you can find this information in its [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16000kHz to use the pretrained Wav2Vec2 model: - -```py ->>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000)) ->>> minds["train"][0] -{'audio': {'array': array([-2.38064706e-04, -1.58618059e-04, -5.43987835e-06, ..., - 2.78103951e-04, 2.38446111e-04, 1.18740834e-04], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', - 'sampling_rate': 16000}, - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', - 'transcription': "hi I'm trying to use the banking app on my phone and currently my checking and savings account balance is not refreshing"} -``` - -As you can see in the `transcription` above, the text contains a mix of upper and lowercase characters. 
The Wav2Vec2 tokenizer is only trained on uppercase characters so you'll need to make sure the text matches the tokenizer's vocabulary: - -```py ->>> def uppercase(example): -... return {"transcription": example["transcription"].upper()} - - ->>> minds = minds.map(uppercase) -``` - -Now create a preprocessing function that: - -1. Calls the `audio` column to load and resample the audio file. -2. Extracts the `input_values` from the audio file and tokenize the `transcription` column with the processor. - -```py ->>> def prepare_dataset(batch): -... audio = batch["audio"] -... batch = processor(audio["array"], sampling_rate=audio["sampling_rate"], text=batch["transcription"]) -... batch["input_length"] = len(batch["input_values"][0]) -... return batch -``` - -To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up `map` by increasing the number of processes with the `num_proc` parameter. Remove the columns you don't need with the [`~datasets.Dataset.remove_columns`] method: - -```py ->>> encoded_minds = minds.map(prepare_dataset, remove_columns=minds.column_names["train"], num_proc=4) -``` - -🤗 Transformers doesn't have a data collator for ASR, so you'll need to adapt the [`DataCollatorWithPadding`] to create a batch of examples. It'll also dynamically pad your text and labels to the length of the longest element in its batch (instead of the entire dataset) so they are a uniform length. While it is possible to pad your text in the `tokenizer` function by setting `padding=True`, dynamic padding is more efficient. - -Unlike other data collators, this specific data collator needs to apply a different padding method to `input_values` and `labels`: - -```py ->>> import torch - ->>> from dataclasses import dataclass, field ->>> from typing import Any, Dict, List, Optional, Union - - ->>> @dataclass -... class DataCollatorCTCWithPadding: - -... processor: AutoProcessor -... padding: Union[bool, str] = "longest" - -... def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: -... # split inputs and labels since they have to be of different lengths and need -... # different padding methods -... input_features = [{"input_values": feature["input_values"][0]} for feature in features] -... label_features = [{"input_ids": feature["labels"]} for feature in features] - -... batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt") - -... labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt") - -... # replace padding with -100 to ignore loss correctly -... labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) - -... batch["labels"] = labels - -... return batch -``` - -Now instantiate your `DataCollatorForCTCWithPadding`: - -```py ->>> data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest") -``` - -## Evaluate - -Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. 
For this task, load the [word error rate](https://huggingface.co/spaces/evaluate-metric/wer) (WER) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): - -```py ->>> import evaluate - ->>> wer = evaluate.load("wer") -``` - -Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the WER: - -```py ->>> import numpy as np - - ->>> def compute_metrics(pred): -... pred_logits = pred.predictions -... pred_ids = np.argmax(pred_logits, axis=-1) - -... pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id - -... pred_str = processor.batch_decode(pred_ids) -... label_str = processor.batch_decode(pred.label_ids, group_tokens=False) - -... wer = wer.compute(predictions=pred_str, references=label_str) - -... return {"wer": wer} -``` - -Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. - -## Train - - - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! - - - -You're ready to start training your model now! Load Wav2Vec2 with [`AutoModelForCTC`]. Specify the reduction to apply with the `ctc_loss_reduction` parameter. It is often better to use the average instead of the default summation: - -```py ->>> from transformers import AutoModelForCTC, TrainingArguments, Trainer - ->>> model = AutoModelForCTC.from_pretrained( -... "facebook/wav2vec2-base", -... ctc_loss_reduction="mean", -... pad_token_id=processor.tokenizer.pad_token_id, -... ) -``` - -At this point, only three steps remain: - -1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the WER and save the training checkpoint. -2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. -3. Call [`~Trainer.train`] to finetune your model. - -```py ->>> training_args = TrainingArguments( -... output_dir="my_awesome_asr_mind_model", -... per_device_train_batch_size=8, -... gradient_accumulation_steps=2, -... learning_rate=1e-5, -... warmup_steps=500, -... max_steps=2000, -... gradient_checkpointing=True, -... fp16=True, -... group_by_length=True, -... evaluation_strategy="steps", -... per_device_eval_batch_size=8, -... save_steps=1000, -... eval_steps=1000, -... logging_steps=25, -... load_best_model_at_end=True, -... metric_for_best_model="wer", -... greater_is_better=False, -... push_to_hub=True, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=encoded_minds["train"], -... eval_dataset=encoded_minds["test"], -... tokenizer=processor.feature_extractor, -... data_collator=data_collator, -... compute_metrics=compute_metrics, -... 
) - ->>> trainer.train() -``` - -Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - - - - - -For a more in-depth example of how to finetune a model for automatic speech recognition, take a look at this blog [post](https://huggingface.co/blog/fine-tune-wav2vec2-english) for English ASR and this [post](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) for multilingual ASR. - - - -## Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Load an audio file you'd like to run inference on. Remember to resample the sampling rate of the audio file to match the sampling rate of the model if you need to! - -```py ->>> from datasets import load_dataset, Audio - ->>> dataset = load_dataset("PolyAI/minds14", "en-US", split="train") ->>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) ->>> sampling_rate = dataset.features["audio"].sampling_rate ->>> audio_file = dataset[0]["audio"]["path"] -``` - -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for automatic speech recognition with your model, and pass your audio file to it: - -```py ->>> from transformers import pipeline - ->>> transcriber = pipeline("automatic-speech-recognition", model="stevhliu/my_awesome_asr_minds_model") ->>> transcriber(audio_file) -{'text': 'I WOUD LIKE O SET UP JOINT ACOUNT WTH Y PARTNER'} -``` - - - -The transcription is decent, but it could be better! Try finetuning your model on more examples to get even better results! - - - -You can also manually replicate the results of the `pipeline` if you'd like: - - - -Load a processor to preprocess the audio file and transcription and return the `input` as PyTorch tensors: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("stevhliu/my_awesome_asr_mind_model") ->>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") -``` - -Pass your inputs to the model and return the logits: - -```py ->>> from transformers import AutoModelForCTC - ->>> model = AutoModelForCTC.from_pretrained("stevhliu/my_awesome_asr_mind_model") ->>> with torch.no_grad(): -... logits = model(**inputs).logits -``` - -Get the predicted `input_ids` with the highest probability, and use the processor to decode the predicted `input_ids` back into text: - -```py ->>> import torch - ->>> predicted_ids = torch.argmax(logits, dim=-1) ->>> transcription = processor.batch_decode(predicted_ids) ->>> transcription -['I WOUL LIKE O SET UP JOINT ACOUNT WTH Y PARTNER'] -``` - - \ No newline at end of file diff --git a/docs/source/en/tasks/audio_classification.md b/docs/source/en/tasks/audio_classification.md new file mode 100644 index 000000000000..743a797fc53f --- /dev/null +++ b/docs/source/en/tasks/audio_classification.md @@ -0,0 +1,329 @@ + + +# Audio classification + +[[open-in-colab]] + + + +Audio classification - just like with text - assigns a class label output from the input data. The only difference is instead of text inputs, you have raw audio waveforms. Some practical applications of audio classification include identifying speaker intent, language classification, and even animal species by their sounds. + +This guide will show you how to: + +1. 
Finetune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset to classify speaker intent. +2. Use your finetuned model for inference. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[Audio Spectrogram Transformer](../model_doc/audio-spectrogram-transformer), [Data2VecAudio](../model_doc/data2vec-audio), [Hubert](../model_doc/hubert), [SEW](../model_doc/sew), [SEW-D](../model_doc/sew-d), [UniSpeech](../model_doc/unispeech), [UniSpeechSat](../model_doc/unispeech-sat), [Wav2Vec2](../model_doc/wav2vec2), [Wav2Vec2-Conformer](../model_doc/wav2vec2-conformer), [WavLM](../model_doc/wavlm), [Whisper](../model_doc/whisper) + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate +``` + +We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load MInDS-14 dataset + +Start by loading the MInDS-14 dataset from the 🤗 Datasets library: + +```py +>>> from datasets import load_dataset, Audio + +>>> minds = load_dataset("PolyAI/minds14", name="en-US", split="train") +``` + +Split the dataset's `train` split into a smaller train and test set with the [`~datasets.Dataset.train_test_split`] method. This'll give you a chance to experiment and make sure everything works before spending more time on the full dataset. + +```py +>>> minds = minds.train_test_split(test_size=0.2) +``` + +Then take a look at the dataset: + +```py +>>> minds +DatasetDict({ + train: Dataset({ + features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], + num_rows: 450 + }) + test: Dataset({ + features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], + num_rows: 113 + }) +}) +``` + +While the dataset contains a lot of useful information, like `lang_id` and `english_transcription`, you'll focus on the `audio` and `intent_class` in this guide. Remove the other columns with the [`~datasets.Dataset.remove_columns`] method: + +```py +>>> minds = minds.remove_columns(["path", "transcription", "english_transcription", "lang_id"]) +``` + +Take a look at an example now: + +```py +>>> minds["train"][0] +{'audio': {'array': array([ 0. , 0. , 0. , ..., -0.00048828, + -0.00024414, -0.00024414], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602b9a5fbb1e6d0fbce91f52.wav', + 'sampling_rate': 8000}, + 'intent_class': 2} +``` + +There are two fields: + +- `audio`: a 1-dimensional `array` of the speech signal that must be called to load and resample the audio file. +- `intent_class`: represents the class id of the speaker's intent. + +To make it easier for the model to get the label name from the label id, create a dictionary that maps the label name to an integer and vice versa: + +```py +>>> labels = minds["train"].features["intent_class"].names +>>> label2id, id2label = dict(), dict() +>>> for i, label in enumerate(labels): +... label2id[label] = str(i) +... 
id2label[str(i)] = label
+```
+
+Now you can convert the label id to a label name:
+
+```py
+>>> id2label[str(2)]
+'app_error'
+```
+
+## Preprocess
+
+The next step is to load a Wav2Vec2 feature extractor to process the audio signal:
+
+```py
+>>> from transformers import AutoFeatureExtractor
+
+>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
+```
+
+The MInDS-14 dataset has a sampling rate of 8kHz (you can find this information in its [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16kHz to use the pretrained Wav2Vec2 model:
+
+```py
+>>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
+>>> minds["train"][0]
+{'audio': {'array': array([ 2.2098757e-05, 4.6582241e-05, -2.2803260e-05, ...,
+ -2.8419291e-04, -2.3305941e-04, -1.1425107e-04], dtype=float32),
+ 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602b9a5fbb1e6d0fbce91f52.wav',
+ 'sampling_rate': 16000},
+ 'intent_class': 2}
+```
+
+Now create a preprocessing function that:
+
+1. Calls the `audio` column to load and, if necessary, resample the audio file.
+2. Checks if the sampling rate of the audio file matches the sampling rate of the audio data a model was pretrained with. You can find this information in the Wav2Vec2 [model card](https://huggingface.co/facebook/wav2vec2-base).
+3. Sets a maximum input length to batch longer inputs without truncating them.
+
+```py
+>>> def preprocess_function(examples):
+...     audio_arrays = [x["array"] for x in examples["audio"]]
+...     inputs = feature_extractor(
+...         audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=16000, truncation=True
+...     )
+...     return inputs
+```
+
+To apply the preprocessing function over the entire dataset, use the 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once. Remove the columns you don't need, and rename `intent_class` to `label` because that's the name the model expects:
+
+```py
+>>> encoded_minds = minds.map(preprocess_function, remove_columns="audio", batched=True)
+>>> encoded_minds = encoded_minds.rename_column("intent_class", "label")
+```
+
+## Evaluate
+
+Including a metric during training is often helpful for evaluating your model's performance. You can quickly load an evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):
+
+```py
+>>> import evaluate
+
+>>> accuracy = evaluate.load("accuracy")
+```
+
+Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the accuracy:
+
+```py
+>>> import numpy as np
+
+
+>>> def compute_metrics(eval_pred):
+...     predictions = np.argmax(eval_pred.predictions, axis=1)
+...     return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)
+```
+
+Your `compute_metrics` function is ready to go now, and you'll return to it when you set up your training.
+
+## Train
+
+
+
+
+
+If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)!
+ + + +You're ready to start training your model now! Load Wav2Vec2 with [`AutoModelForAudioClassification`] along with the number of expected labels, and the label mappings: + +```py +>>> from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer + +>>> num_labels = len(id2label) +>>> model = AutoModelForAudioClassification.from_pretrained( +... "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label +... ) +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the accuracy and save the training checkpoint. +2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. +3. Call [`~Trainer.train`] to finetune your model. + + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_mind_model", +... evaluation_strategy="epoch", +... save_strategy="epoch", +... learning_rate=3e-5, +... per_device_train_batch_size=32, +... gradient_accumulation_steps=4, +... per_device_eval_batch_size=32, +... num_train_epochs=10, +... warmup_ratio=0.1, +... logging_steps=10, +... load_best_model_at_end=True, +... metric_for_best_model="accuracy", +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=encoded_minds["train"], +... eval_dataset=encoded_minds["test"], +... tokenizer=feature_extractor, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + + +For a more in-depth example of how to finetune a model for audio classification, take a look at the corresponding [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb). + + + +## Inference + +Great, now that you've finetuned a model, you can use it for inference! + +Load an audio file you'd like to run inference on. Remember to resample the sampling rate of the audio file to match the sampling rate of the model if you need to! + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) +>>> sampling_rate = dataset.features["audio"].sampling_rate +>>> audio_file = dataset[0]["audio"]["path"] +``` + +The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. 
Instantiate a `pipeline` for audio classification with your model, and pass your audio file to it: + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline("audio-classification", model="stevhliu/my_awesome_minds_model") +>>> classifier(audio_file) +[ + {'score': 0.09766869246959686, 'label': 'cash_deposit'}, + {'score': 0.07998877018690109, 'label': 'app_error'}, + {'score': 0.0781070664525032, 'label': 'joint_account'}, + {'score': 0.07667109370231628, 'label': 'pay_bill'}, + {'score': 0.0755252093076706, 'label': 'balance'} +] +``` + +You can also manually replicate the results of the `pipeline` if you'd like: + + + +Load a feature extractor to preprocess the audio file and return the `input` as PyTorch tensors: + +```py +>>> from transformers import AutoFeatureExtractor + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained("stevhliu/my_awesome_minds_model") +>>> inputs = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") +``` + +Pass your inputs to the model and return the logits: + +```py +>>> from transformers import AutoModelForAudioClassification + +>>> model = AutoModelForAudioClassification.from_pretrained("stevhliu/my_awesome_minds_model") +>>> with torch.no_grad(): +... logits = model(**inputs).logits +``` + +Get the class with the highest probability, and use the model's `id2label` mapping to convert it to a label: + +```py +>>> import torch + +>>> predicted_class_ids = torch.argmax(logits).item() +>>> predicted_label = model.config.id2label[predicted_class_ids] +>>> predicted_label +'cash_deposit' +``` + + \ No newline at end of file diff --git a/docs/source/en/tasks/audio_classification.mdx b/docs/source/en/tasks/audio_classification.mdx deleted file mode 100644 index ab0abbced785..000000000000 --- a/docs/source/en/tasks/audio_classification.mdx +++ /dev/null @@ -1,317 +0,0 @@ - - -# Audio classification - - - -Audio classification - just like with text - assigns a class label output from the input data. The only difference is instead of text inputs, you have raw audio waveforms. Some practical applications of audio classification include identifying speaker intent, language classification, and even animal species by their sounds. - -This guide will show you how to: - -1. Finetune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset to classify speaker intent. -2. Use your finetuned model for inference. - - - -See the audio classification [task page](https://huggingface.co/tasks/audio-classification) for more information about its associated models, datasets, and metrics. - - - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install transformers datasets evaluate -``` - -We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load MInDS-14 dataset - -Start by loading the MInDS-14 dataset from the 🤗 Datasets library: - -```py ->>> from datasets import load_dataset, Audio - ->>> minds = load_dataset("PolyAI/minds14", name="en-US", split="train") -``` - -Split the dataset's `train` split into a smaller train and test set with the [`~datasets.Dataset.train_test_split`] method. This'll give you a chance to experiment and make sure everything works before spending more time on the full dataset. 
- -```py ->>> minds = minds.train_test_split(test_size=0.2) -``` - -Then take a look at the dataset: - -```py ->>> minds -DatasetDict({ - train: Dataset({ - features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], - num_rows: 450 - }) - test: Dataset({ - features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], - num_rows: 113 - }) -}) -``` - -While the dataset contains a lot of useful information, like `lang_id` and `english_transcription`, you'll focus on the `audio` and `intent_class` in this guide. Remove the other columns with the [`~datasets.Dataset.remove_columns`] method: - -```py ->>> minds = minds.remove_columns(["path", "transcription", "english_transcription", "lang_id"]) -``` - -Take a look at an example now: - -```py ->>> minds["train"][0] -{'audio': {'array': array([ 0. , 0. , 0. , ..., -0.00048828, - -0.00024414, -0.00024414], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602b9a5fbb1e6d0fbce91f52.wav', - 'sampling_rate': 8000}, - 'intent_class': 2} -``` - -There are two fields: - -- `audio`: a 1-dimensional `array` of the speech signal that must be called to load and resample the audio file. -- `intent_class`: represents the class id of the speaker's intent. - -To make it easier for the model to get the label name from the label id, create a dictionary that maps the label name to an integer and vice versa: - -```py ->>> labels = minds["train"].features["intent_class"].names ->>> label2id, id2label = dict(), dict() ->>> for i, label in enumerate(labels): -... label2id[label] = str(i) -... id2label[str(i)] = label -``` - -Now you can convert the label id to a label name: - -```py ->>> id2label[str(2)] -'app_error' -``` - -## Preprocess - -The next step is to load a Wav2Vec2 feature extractor to process the audio signal: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") -``` - -The MInDS-14 dataset has a sampling rate of 8000khz (you can find this information in it's [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16000kHz to use the pretrained Wav2Vec2 model: - -```py ->>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000)) ->>> minds["train"][0] -{'audio': {'array': array([ 2.2098757e-05, 4.6582241e-05, -2.2803260e-05, ..., - -2.8419291e-04, -2.3305941e-04, -1.1425107e-04], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602b9a5fbb1e6d0fbce91f52.wav', - 'sampling_rate': 16000}, - 'intent_class': 2} -``` - -Now create a preprocessing function that: - -1. Calls the `audio` column to load, and if necessary, resample the audio file. -2. Checks if the sampling rate of the audio file matches the sampling rate of the audio data a model was pretrained with. You can find this information in the Wav2Vec2 [model card](https://huggingface.co/facebook/wav2vec2-base). -3. Set a maximum input length to batch longer inputs without truncating them. - -```py ->>> def preprocess_function(examples): -... audio_arrays = [x["array"] for x in examples["audio"]] -... inputs = feature_extractor( -... audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=16000, truncation=True -... ) -... 
return inputs -``` - -To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once. Remove the columns you don't need, and rename `intent_class` to `label` because that's the name the model expects: - -```py ->>> encoded_minds = minds.map(preprocess_function, remove_columns="audio", batched=True) ->>> encoded_minds = encoded_minds.rename_column("intent_class", "label") -``` - -## Evaluate - -Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): - -```py ->>> import evaluate - ->>> accuracy = evaluate.load("accuracy") -``` - -Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the accuracy: - -```py ->>> import numpy as np - - ->>> def compute_metrics(eval_pred): -... predictions = np.argmax(eval_pred.predictions, axis=1) -... return accuracy.compute(predictions=predictions, references=eval_pred.label_ids) -``` - -Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. - -## Train - - - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! - - -You're ready to start training your model now! Load Wav2Vec2 with [`AutoModelForAudioClassification`] along with the number of expected labels, and the label mappings: - -```py ->>> from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer - ->>> num_labels = len(id2label) ->>> model = AutoModelForAudioClassification.from_pretrained( -... "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label -... ) -``` - -At this point, only three steps remain: - -1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the accuracy and save the training checkpoint. -2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. -3. Call [`~Trainer.train`] to finetune your model. - - -```py ->>> training_args = TrainingArguments( -... output_dir="my_awesome_mind_model", -... evaluation_strategy="epoch", -... save_strategy="epoch", -... learning_rate=3e-5, -... per_device_train_batch_size=32, -... gradient_accumulation_steps=4, -... per_device_eval_batch_size=32, -... num_train_epochs=10, -... warmup_ratio=0.1, -... logging_steps=10, -... load_best_model_at_end=True, -... metric_for_best_model="accuracy", -... push_to_hub=True, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=encoded_minds["train"], -... eval_dataset=encoded_minds["test"], -... tokenizer=feature_extractor, -... compute_metrics=compute_metrics, -... 
) - ->>> trainer.train() -``` - -Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - - - - - -For a more in-depth example of how to finetune a model for audio classification, take a look at the corresponding [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb). - - - -## Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Load an audio file you'd like to run inference on. Remember to resample the sampling rate of the audio file to match the sampling rate of the model if you need to! - -```py ->>> from datasets import load_dataset, Audio - ->>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") ->>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) ->>> sampling_rate = dataset.features["audio"].sampling_rate ->>> audio_file = dataset[0]["audio"]["path"] -``` - -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for audio classification with your model, and pass your audio file to it: - -```py ->>> from transformers import pipeline - ->>> classifier = pipeline("audio-classification", model="stevhliu/my_awesome_minds_model") ->>> classifier(audio_file) -[ - {'score': 0.09766869246959686, 'label': 'cash_deposit'}, - {'score': 0.07998877018690109, 'label': 'app_error'}, - {'score': 0.0781070664525032, 'label': 'joint_account'}, - {'score': 0.07667109370231628, 'label': 'pay_bill'}, - {'score': 0.0755252093076706, 'label': 'balance'} -] -``` - -You can also manually replicate the results of the `pipeline` if you'd like: - - - -Load a feature extractor to preprocess the audio file and return the `input` as PyTorch tensors: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained("stevhliu/my_awesome_minds_model") ->>> inputs = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") -``` - -Pass your inputs to the model and return the logits: - -```py ->>> from transformers import AutoModelForAudioClassification - ->>> model = AutoModelForAudioClassification.from_pretrained("stevhliu/my_awesome_minds_model") ->>> with torch.no_grad(): -... logits = model(**inputs).logits -``` - -Get the class with the highest probability, and use the model's `id2label` mapping to convert it to a label: - -```py ->>> import torch - ->>> predicted_class_ids = torch.argmax(logits).item() ->>> predicted_label = model.config.id2label[predicted_class_ids] ->>> predicted_label -'cash_deposit' -``` - - \ No newline at end of file diff --git a/docs/source/en/tasks/document_question_answering.md b/docs/source/en/tasks/document_question_answering.md new file mode 100644 index 000000000000..24bf3a069ac9 --- /dev/null +++ b/docs/source/en/tasks/document_question_answering.md @@ -0,0 +1,498 @@ + + +# Document Question Answering + +[[open-in-colab]] + +Document Question Answering, also referred to as Document Visual Question Answering, is a task that involves providing +answers to questions posed about document images. The input to models supporting this task is typically a combination of an image and +a question, and the output is an answer expressed in natural language. 
These models utilize multiple modalities, including +text, the positions of words (bounding boxes), and the image itself. + +This guide illustrates how to: + +- Fine-tune [LayoutLMv2](../model_doc/layoutlmv2) on the [DocVQA dataset](https://huggingface.co/datasets/nielsr/docvqa_1200_examples_donut). +- Use your fine-tuned model for inference. + + + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3) + + + + + +LayoutLMv2 solves the document question-answering task by adding a question-answering head on top of the final hidden +states of the tokens, to predict the positions of the start and end tokens of the +answer. In other words, the problem is treated as extractive question answering: given the context, extract which piece +of information answers the question. The context comes from the output of an OCR engine, here it is Google's Tesseract. + +Before you begin, make sure you have all the necessary libraries installed. LayoutLMv2 depends on detectron2, torchvision and tesseract. + +```bash +pip install -q transformers datasets +``` + +```bash +pip install 'git+https://github.com/facebookresearch/detectron2.git' +pip install torchvision +``` + +```bash +sudo apt install tesseract-ocr +pip install -q pytesseract +``` + +Once you have installed all of the dependencies, restart your runtime. + +We encourage you to share your model with the community. Log in to your Hugging Face account to upload it to the 🤗 Hub. +When prompted, enter your token to log in: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +Let's define some global variables. + +```py +>>> model_checkpoint = "microsoft/layoutlmv2-base-uncased" +>>> batch_size = 4 +``` + +## Load the data + +In this guide we use a small sample of preprocessed DocVQA that you can find on 🤗 Hub. If you'd like to use the full +DocVQA dataset, you can register and download it on [DocVQA homepage](https://rrc.cvc.uab.es/?ch=17). If you do so, to +proceed with this guide check out [how to load files into a 🤗 dataset](https://huggingface.co/docs/datasets/loading#local-and-remote-files). + +```py +>>> from datasets import load_dataset + +>>> dataset = load_dataset("nielsr/docvqa_1200_examples") +>>> dataset +DatasetDict({ + train: Dataset({ + features: ['id', 'image', 'query', 'answers', 'words', 'bounding_boxes', 'answer'], + num_rows: 1000 + }) + test: Dataset({ + features: ['id', 'image', 'query', 'answers', 'words', 'bounding_boxes', 'answer'], + num_rows: 200 + }) +}) +``` + +As you can see, the dataset is split into train and test sets already. Take a look at a random example to familiarize +yourself with the features. + +```py +>>> dataset["train"].features +``` + +Here's what the individual fields represent: +* `id`: the example's id +* `image`: a PIL.Image.Image object containing the document image +* `query`: the question string - natural language asked question, in several languages +* `answers`: a list of correct answers provided by human annotators +* `words` and `bounding_boxes`: the results of OCR, which we will not use here +* `answer`: an answer matched by a different model which we will not use here + +Let's leave only English questions, and drop the `answer` feature which appears to contain predictions by another model. +We'll also take the first of the answers from the set provided by the annotators. 
Alternatively, you can randomly sample it. + +```py +>>> updated_dataset = dataset.map(lambda example: {"question": example["query"]["en"]}, remove_columns=["query"]) +>>> updated_dataset = updated_dataset.map( +... lambda example: {"answer": example["answers"][0]}, remove_columns=["answer", "answers"] +... ) +``` + +Note that the LayoutLMv2 checkpoint that we use in this guide has been trained with `max_position_embeddings = 512` (you can +find this information in the [checkpoint's `config.json` file](https://huggingface.co/microsoft/layoutlmv2-base-uncased/blob/main/config.json#L18)). +We can truncate the examples but to avoid the situation where the answer might be at the end of a large document and end up truncated, +here we'll remove the few examples where the embedding is likely to end up longer than 512. +If most of the documents in your dataset are long, you can implement a sliding window strategy - check out [this notebook](https://github.com/huggingface/notebooks/blob/main/examples/question_answering.ipynb) for details. + +```py +>>> updated_dataset = updated_dataset.filter(lambda x: len(x["words"]) + len(x["question"].split()) < 512) +``` + +At this point let's also remove the OCR features from this dataset. These are a result of OCR for fine-tuning a different +model. They would still require some processing if we wanted to use them, as they do not match the input requirements +of the model we use in this guide. Instead, we can use the [`LayoutLMv2Processor`] on the original data for both OCR and +tokenization. This way we'll get the inputs that match model's expected input. If you want to process images manually, +check out the [`LayoutLMv2` model documentation](../model_doc/layoutlmv2) to learn what input format the model expects. + +```py +>>> updated_dataset = updated_dataset.remove_columns("words") +>>> updated_dataset = updated_dataset.remove_columns("bounding_boxes") +``` + +Finally, the data exploration won't be complete if we don't peek at an image example. + +```py +>>> updated_dataset["train"][11]["image"] +``` + +
+    [Image: DocVQA Image Example]
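+
+Before preprocessing, a quick optional check can confirm how many examples survived the length filter above; the exact counts depend on the dataset sample, so no output is shown here:
+
+```py
+>>> # Number of remaining examples per split (counts will vary with the sample used)
+>>> {split: updated_dataset[split].num_rows for split in updated_dataset}
+```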
+ +## Preprocess the data + +The Document Question Answering task is a multimodal task, and you need to make sure that the inputs from each modality +are preprocessed according to the model's expectations. Let's start by loading the [`LayoutLMv2Processor`], which internally combines an image processor that can handle image data and a tokenizer that can encode text data. + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained(model_checkpoint) +``` + +### Preprocessing document images + +First, let's prepare the document images for the model with the help of the `image_processor` from the processor. +By default, image processor resizes the images to 224x224, makes sure they have the correct order of color channels, +applies OCR with tesseract to get words and normalized bounding boxes. In this tutorial, all of these defaults are exactly what we need. +Write a function that applies the default image processing to a batch of images and returns the results of OCR. + +```py +>>> image_processor = processor.image_processor + + +>>> def get_ocr_words_and_boxes(examples): +... images = [image.convert("RGB") for image in examples["image"]] +... encoded_inputs = image_processor(images) + +... examples["image"] = encoded_inputs.pixel_values +... examples["words"] = encoded_inputs.words +... examples["boxes"] = encoded_inputs.boxes + +... return examples +``` + +To apply this preprocessing to the entire dataset in a fast way, use [`~datasets.Dataset.map`]. + +```py +>>> dataset_with_ocr = updated_dataset.map(get_ocr_words_and_boxes, batched=True, batch_size=2) +``` + +### Preprocessing text data + +Once we have applied OCR to the images, we need to encode the text part of the dataset to prepare it for the model. +This involves converting the words and boxes that we got in the previous step to token-level `input_ids`, `attention_mask`, +`token_type_ids` and `bbox`. For preprocessing text, we'll need the `tokenizer` from the processor. + +```py +>>> tokenizer = processor.tokenizer +``` + +On top of the preprocessing mentioned above, we also need to add the labels for the model. For `xxxForQuestionAnswering` models +in 🤗 Transformers, the labels consist of the `start_positions` and `end_positions`, indicating which token is at the +start and which token is at the end of the answer. + +Let's start with that. Define a helper function that can find a sublist (the answer split into words) in a larger list (the words list). + +This function will take two lists as input, `words_list` and `answer_list`. It will then iterate over the `words_list` and check +if the current word in the `words_list` (words_list[i]) is equal to the first word of answer_list (answer_list[0]) and if +the sublist of `words_list` starting from the current word and of the same length as `answer_list` is equal `to answer_list`. +If this condition is true, it means that a match has been found, and the function will record the match, its starting index (idx), +and its ending index (idx + len(answer_list) - 1). If more than one match was found, the function will return only the first one. +If no match is found, the function returns (`None`, 0, and 0). + +```py +>>> def subfinder(words_list, answer_list): +... matches = [] +... start_indices = [] +... end_indices = [] +... for idx, i in enumerate(range(len(words_list))): +... if words_list[i] == answer_list[0] and words_list[i : i + len(answer_list)] == answer_list: +... matches.append(answer_list) +... start_indices.append(idx) +... 
end_indices.append(idx + len(answer_list) - 1) +... if matches: +... return matches[0], start_indices[0], end_indices[0] +... else: +... return None, 0, 0 +``` + +To illustrate how this function finds the position of the answer, let's use it on an example: + +```py +>>> example = dataset_with_ocr["train"][1] +>>> words = [word.lower() for word in example["words"]] +>>> match, word_idx_start, word_idx_end = subfinder(words, example["answer"].lower().split()) +>>> print("Question: ", example["question"]) +>>> print("Words:", words) +>>> print("Answer: ", example["answer"]) +>>> print("start_index", word_idx_start) +>>> print("end_index", word_idx_end) +Question: Who is in cc in this letter? +Words: ['wie', 'baw', 'brown', '&', 'williamson', 'tobacco', 'corporation', 'research', '&', 'development', 'internal', 'correspondence', 'to:', 'r.', 'h.', 'honeycutt', 'ce:', 't.f.', 'riehl', 'from:', '.', 'c.j.', 'cook', 'date:', 'may', '8,', '1995', 'subject:', 'review', 'of', 'existing', 'brainstorming', 'ideas/483', 'the', 'major', 'function', 'of', 'the', 'product', 'innovation', 'graup', 'is', 'to', 'develop', 'marketable', 'nove!', 'products', 'that', 'would', 'be', 'profitable', 'to', 'manufacture', 'and', 'sell.', 'novel', 'is', 'defined', 'as:', 'of', 'a', 'new', 'kind,', 'or', 'different', 'from', 'anything', 'seen', 'or', 'known', 'before.', 'innovation', 'is', 'defined', 'as:', 'something', 'new', 'or', 'different', 'introduced;', 'act', 'of', 'innovating;', 'introduction', 'of', 'new', 'things', 'or', 'methods.', 'the', 'products', 'may', 'incorporate', 'the', 'latest', 'technologies,', 'materials', 'and', 'know-how', 'available', 'to', 'give', 'then', 'a', 'unique', 'taste', 'or', 'look.', 'the', 'first', 'task', 'of', 'the', 'product', 'innovation', 'group', 'was', 'to', 'assemble,', 'review', 'and', 'categorize', 'a', 'list', 'of', 'existing', 'brainstorming', 'ideas.', 'ideas', 'were', 'grouped', 'into', 'two', 'major', 'categories', 'labeled', 'appearance', 'and', 'taste/aroma.', 'these', 'categories', 'are', 'used', 'for', 'novel', 'products', 'that', 'may', 'differ', 'from', 'a', 'visual', 'and/or', 'taste/aroma', 'point', 'of', 'view', 'compared', 'to', 'canventional', 'cigarettes.', 'other', 'categories', 'include', 'a', 'combination', 'of', 'the', 'above,', 'filters,', 'packaging', 'and', 'brand', 'extensions.', 'appearance', 'this', 'category', 'is', 'used', 'for', 'novel', 'cigarette', 'constructions', 'that', 'yield', 'visually', 'different', 'products', 'with', 'minimal', 'changes', 'in', 'smoke', 'chemistry', 'two', 'cigarettes', 'in', 'cne.', 'emulti-plug', 'te', 'build', 'yaur', 'awn', 'cigarette.', 'eswitchable', 'menthol', 'or', 'non', 'menthol', 'cigarette.', '*cigarettes', 'with', 'interspaced', 'perforations', 'to', 'enable', 'smoker', 'to', 'separate', 'unburned', 'section', 'for', 'future', 'smoking.', '«short', 'cigarette,', 'tobacco', 'section', '30', 'mm.', '«extremely', 'fast', 'buming', 'cigarette.', '«novel', 'cigarette', 'constructions', 'that', 'permit', 'a', 'significant', 'reduction', 'iretobacco', 'weight', 'while', 'maintaining', 'smoking', 'mechanics', 'and', 'visual', 'characteristics.', 'higher', 'basis', 'weight', 'paper:', 'potential', 'reduction', 'in', 'tobacco', 'weight.', '«more', 'rigid', 'tobacco', 'column;', 'stiffing', 'agent', 'for', 'tobacco;', 'e.g.', 'starch', '*colored', 'tow', 'and', 'cigarette', 'papers;', 'seasonal', 'promotions,', 'e.g.', 'pastel', 'colored', 'cigarettes', 'for', 'easter', 'or', 'in', 'an', 'ebony', 'and', 'ivory', 
'brand', 'containing', 'a', 'mixture', 'of', 'all', 'black', '(black', 'paper', 'and', 'tow)', 'and', 'ail', 'white', 'cigarettes.', '499150498'] +Answer: T.F. Riehl +start_index 17 +end_index 18 +``` + +Once examples are encoded, however, they will look like this: + +```py +>>> encoding = tokenizer(example["question"], example["words"], example["boxes"]) +>>> tokenizer.decode(encoding["input_ids"]) +[CLS] who is in cc in this letter? [SEP] wie baw brown & williamson tobacco corporation research & development ... +``` + +We'll need to find the position of the answer in the encoded input. +* `token_type_ids` tells us which tokens are part of the question, and which ones are part of the document's words. +* `tokenizer.cls_token_id` will help find the special token at the beginning of the input. +* `word_ids` will help match the answer found in the original `words` to the same answer in the full encoded input and determine +the start/end position of the answer in the encoded input. + +With that in mind, let's create a function to encode a batch of examples in the dataset: + +```py +>>> def encode_dataset(examples, max_length=512): +... questions = examples["question"] +... words = examples["words"] +... boxes = examples["boxes"] +... answers = examples["answer"] + +... # encode the batch of examples and initialize the start_positions and end_positions +... encoding = tokenizer(questions, words, boxes, max_length=max_length, padding="max_length", truncation=True) +... start_positions = [] +... end_positions = [] + +... # loop through the examples in the batch +... for i in range(len(questions)): +... cls_index = encoding["input_ids"][i].index(tokenizer.cls_token_id) + +... # find the position of the answer in example's words +... words_example = [word.lower() for word in words[i]] +... answer = answers[i] +... match, word_idx_start, word_idx_end = subfinder(words_example, answer.lower().split()) + +... if match: +... # if match is found, use `token_type_ids` to find where words start in the encoding +... token_type_ids = encoding["token_type_ids"][i] +... token_start_index = 0 +... while token_type_ids[token_start_index] != 1: +... token_start_index += 1 + +... token_end_index = len(encoding["input_ids"][i]) - 1 +... while token_type_ids[token_end_index] != 1: +... token_end_index -= 1 + +... word_ids = encoding.word_ids(i)[token_start_index : token_end_index + 1] +... start_position = cls_index +... end_position = cls_index + +... # loop over word_ids and increase `token_start_index` until it matches the answer position in words +... # once it matches, save the `token_start_index` as the `start_position` of the answer in the encoding +... for id in word_ids: +... if id == word_idx_start: +... start_position = token_start_index +... else: +... token_start_index += 1 + +... # similarly loop over `word_ids` starting from the end to find the `end_position` of the answer +... for id in word_ids[::-1]: +... if id == word_idx_end: +... end_position = token_end_index +... else: +... token_end_index -= 1 + +... start_positions.append(start_position) +... end_positions.append(end_position) + +... else: +... start_positions.append(cls_index) +... end_positions.append(cls_index) + +... encoding["image"] = examples["image"] +... encoding["start_positions"] = start_positions +... encoding["end_positions"] = end_positions + +... return encoding +``` + +Now that we have this preprocessing function, we can encode the entire dataset: + +```py +>>> encoded_train_dataset = dataset_with_ocr["train"].map( +... 
encode_dataset, batched=True, batch_size=2, remove_columns=dataset_with_ocr["train"].column_names +... ) +>>> encoded_test_dataset = dataset_with_ocr["test"].map( +... encode_dataset, batched=True, batch_size=2, remove_columns=dataset_with_ocr["test"].column_names +... ) +``` + +Let's check what the features of the encoded dataset look like: + +```py +>>> encoded_train_dataset.features +{'image': Sequence(feature=Sequence(feature=Sequence(feature=Value(dtype='uint8', id=None), length=-1, id=None), length=-1, id=None), length=-1, id=None), + 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), + 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), + 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), + 'bbox': Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None), + 'start_positions': Value(dtype='int64', id=None), + 'end_positions': Value(dtype='int64', id=None)} +``` + +## Evaluation + +Evaluation for document question answering requires a significant amount of postprocessing. To avoid taking up too much +of your time, this guide skips the evaluation step. The [`Trainer`] still calculates the evaluation loss during training so +you're not completely in the dark about your model's performance. Extractive question answering is typically evaluated using F1/exact match. +If you'd like to implement it yourself, check out the [Question Answering chapter](https://huggingface.co/course/chapter7/7?fw=pt#postprocessing) +of the Hugging Face course for inspiration. + +## Train + +Congratulations! You've successfully navigated the toughest part of this guide and now you are ready to train your own model. +Training involves the following steps: +* Load the model with [`AutoModelForDocumentQuestionAnswering`] using the same checkpoint as in the preprocessing. +* Define your training hyperparameters in [`TrainingArguments`]. +* Define a function to batch examples together, here the [`DefaultDataCollator`] will do just fine +* Pass the training arguments to [`Trainer`] along with the model, dataset, and data collator. +* Call [`~Trainer.train`] to finetune your model. + +```py +>>> from transformers import AutoModelForDocumentQuestionAnswering + +>>> model = AutoModelForDocumentQuestionAnswering.from_pretrained(model_checkpoint) +``` + +In the [`TrainingArguments`] use `output_dir` to specify where to save your model, and configure hyperparameters as you see fit. +If you wish to share your model with the community, set `push_to_hub` to `True` (you must be signed in to Hugging Face to upload your model). +In this case the `output_dir` will also be the name of the repo where your model checkpoint will be pushed. + +```py +>>> from transformers import TrainingArguments + +>>> # REPLACE THIS WITH YOUR REPO ID +>>> repo_id = "MariaK/layoutlmv2-base-uncased_finetuned_docvqa" + +>>> training_args = TrainingArguments( +... output_dir=repo_id, +... per_device_train_batch_size=4, +... num_train_epochs=20, +... save_steps=200, +... logging_steps=50, +... evaluation_strategy="steps", +... learning_rate=5e-5, +... save_total_limit=2, +... remove_unused_columns=False, +... push_to_hub=True, +... ) +``` + +Define a simple data collator to batch examples together. 
+ +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator() +``` + +Finally, bring everything together, and call [`~Trainer.train`]: + +```py +>>> from transformers import Trainer + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... data_collator=data_collator, +... train_dataset=encoded_train_dataset, +... eval_dataset=encoded_test_dataset, +... tokenizer=processor, +... ) + +>>> trainer.train() +``` + +To add the final model to 🤗 Hub, create a model card and call `push_to_hub`: + +```py +>>> trainer.create_model_card() +>>> trainer.push_to_hub() +``` + +## Inference + +Now that you have finetuned a LayoutLMv2 model, and uploaded it to the 🤗 Hub, you can use it for inference. The simplest +way to try out your finetuned model for inference is to use it in a [`Pipeline`]. + +Let's take an example: +```py +>>> example = dataset["test"][2] +>>> question = example["query"]["en"] +>>> image = example["image"] +>>> print(question) +>>> print(example["answers"]) +'Who is ‘presiding’ TRRF GENERAL SESSION (PART 1)?' +['TRRF Vice President', 'lee a. waller'] +``` + +Next, instantiate a pipeline for +document question answering with your model, and pass the image + question combination to it. + +```py +>>> from transformers import pipeline + +>>> qa_pipeline = pipeline("document-question-answering", model="MariaK/layoutlmv2-base-uncased_finetuned_docvqa") +>>> qa_pipeline(image, question) +[{'score': 0.9949808120727539, + 'answer': 'Lee A. Waller', + 'start': 55, + 'end': 57}] +``` + +You can also manually replicate the results of the pipeline if you'd like: +1. Take an image and a question, prepare them for the model using the processor from your model. +2. Forward the result or preprocessing through the model. +3. The model returns `start_logits` and `end_logits`, which indicate which token is at the start of the answer and +which token is at the end of the answer. Both have shape (batch_size, sequence_length). +4. Take an argmax on the last dimension of both the `start_logits` and `end_logits` to get the predicted `start_idx` and `end_idx`. +5. Decode the answer with the tokenizer. + +```py +>>> import torch +>>> from transformers import AutoProcessor +>>> from transformers import AutoModelForDocumentQuestionAnswering + +>>> processor = AutoProcessor.from_pretrained("MariaK/layoutlmv2-base-uncased_finetuned_docvqa") +>>> model = AutoModelForDocumentQuestionAnswering.from_pretrained("MariaK/layoutlmv2-base-uncased_finetuned_docvqa") + +>>> with torch.no_grad(): +... encoding = processor(image.convert("RGB"), question, return_tensors="pt") +... outputs = model(**encoding) +... start_logits = outputs.start_logits +... end_logits = outputs.end_logits +... predicted_start_idx = start_logits.argmax(-1).item() +... predicted_end_idx = end_logits.argmax(-1).item() + +>>> processor.tokenizer.decode(encoding.input_ids.squeeze()[predicted_start_idx : predicted_end_idx + 1]) +'lee a. waller' +``` \ No newline at end of file diff --git a/docs/source/en/tasks/idefics.md b/docs/source/en/tasks/idefics.md new file mode 100644 index 000000000000..376ec8b308b0 --- /dev/null +++ b/docs/source/en/tasks/idefics.md @@ -0,0 +1,426 @@ + + +# Image tasks with IDEFICS + +[[open-in-colab]] + +While individual tasks can be tackled by fine-tuning specialized models, an alternative approach +that has recently emerged and gained popularity is to use large models for a diverse set of tasks without fine-tuning. 
+For instance, large language models can handle such NLP tasks as summarization, translation, classification, and more. +This approach is no longer limited to a single modality, such as text, and in this guide, we will illustrate how you can +solve image-text tasks with a large multimodal model called IDEFICS. + +[IDEFICS](../model_doc/idefics) is an open-access vision and language model based on [Flamingo](https://huggingface.co/papers/2204.14198), +a state-of-the-art visual language model initially developed by DeepMind. The model accepts arbitrary sequences of image +and text inputs and generates coherent text as output. It can answer questions about images, describe visual content, +create stories grounded in multiple images, and so on. IDEFICS comes in two variants - [80 billion parameters](https://huggingface.co/HuggingFaceM4/idefics-80b) +and [9 billion parameters](https://huggingface.co/HuggingFaceM4/idefics-9b), both of which are available on the 🤗 Hub. For each variant, you can also find fine-tuned instructed +versions of the model adapted for conversational use cases. + +This model is exceptionally versatile and can be used for a wide range of image and multimodal tasks. However, +being a large model means it requires significant computational resources and infrastructure. It is up to you to decide whether +this approach suits your use case better than fine-tuning specialized models for each individual task. + +In this guide, you'll learn how to: +- [Load IDEFICS](#loading-the-model) and [load the quantized version of the model](#loading-the-quantized-version-of-the-model) +- Use IDEFICS for: + - [Image captioning](#image-captioning) + - [Prompted image captioning](#prompted-image-captioning) + - [Few-shot prompting](#few-shot-prompting) + - [Visual question answering](#visual-question-answering) + - [Image classificaiton](#image-classification) + - [Image-guided text generation](#image-guided-text-generation) +- [Run inference in batch mode](#running-inference-in-batch-mode) +- [Run IDEFICS instruct for conversational use](#idefics-instruct-for-conversational-use) + +Before you begin, make sure you have all the necessary libraries installed. + +```bash +pip install -q bitsandbytes sentencepiece accelerate transformers +``` + + +To run the following examples with a non-quantized version of the model checkpoint you will need at least 20GB of GPU memory. + + +## Loading the model + +Let's start by loading the model's 9 billion parameters checkpoint: + +```py +>>> checkpoint = "HuggingFaceM4/idefics-9b" +``` + +Just like for other Transformers models, you need to load a processor and the model itself from the checkpoint. +The IDEFICS processor wraps a [`LlamaTokenizer`] and IDEFICS image processor into a single processor to take care of +preparing text and image inputs for the model. + +```py +>>> import torch + +>>> from transformers import IdeficsForVisionText2Text, AutoProcessor + +>>> processor = AutoProcessor.from_pretrained(checkpoint) + +>>> model = IdeficsForVisionText2Text.from_pretrained(checkpoint, torch_dtype=torch.bfloat16, device_map="auto") +``` + +Setting `device_map` to `"auto"` will automatically determine how to load and store the model weights in the most optimized +manner given existing devices. + +### Quantized model + +If high-memory GPU availability is an issue, you can load the quantized version of the model. 
To load the model and the +processor in 4bit precision, pass a `BitsAndBytesConfig` to the `from_pretrained` method and the model will be compressed +on the fly while loading. + +```py +>>> import torch +>>> from transformers import IdeficsForVisionText2Text, AutoProcessor, BitsAndBytesConfig + +>>> quantization_config = BitsAndBytesConfig( +... load_in_4bit=True, +... bnb_4bit_compute_dtype=torch.float16, +... ) + +>>> processor = AutoProcessor.from_pretrained(checkpoint) + +>>> model = IdeficsForVisionText2Text.from_pretrained( +... checkpoint, +... quantization_config=quantization_config, +... device_map="auto" +... ) +``` + +Now that you have the model loaded in one of the suggested ways, let's move on to exploring tasks that you can use IDEFICS for. + +## Image captioning + +Image captioning is the task of predicting a caption for a given image. A common application is to aid visually impaired +people navigate through different situations, for instance, explore image content online. + +To illustrate the task, get an image to be captioned, e.g.: + +
+ Image of a puppy in a flower bed +
+
+Photo by [Hendo Wang](https://unsplash.com/@hendoo).
+
+IDEFICS accepts text and image prompts. However, to caption an image, you do not have to provide a text prompt to the
+model, only the preprocessed input image. Without a text prompt, the model will start generating text from the
+BOS (beginning-of-sequence) token, thus creating a caption.
+
+As image input to the model, you can use either an image object (`PIL.Image`) or a URL from which the image can be retrieved.
+
+```py
+>>> prompt = [
+...     "https://images.unsplash.com/photo-1583160247711-2191776b4b91?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3542&q=80",
+... ]
+
+>>> inputs = processor(prompt, return_tensors="pt").to("cuda")
+>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+
+>>> generated_ids = model.generate(**inputs, max_new_tokens=10, bad_words_ids=bad_words_ids)
+>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+>>> print(generated_text[0])
+A puppy in a flower bed
+```
+
+<Tip>
+
+It is a good idea to include `bad_words_ids` in the call to `generate` to avoid errors when increasing
+`max_new_tokens`: without it, the model may try to generate a new `<image>` or `<fake_token_around_image>` token even
+though no image is actually being generated by the model.
+You can set it on the fly as in this guide, or store it in the `GenerationConfig` as described in the [Text generation strategies](../generation_strategies) guide.
+
+</Tip>
+
+## Prompted image captioning
+
+You can extend image captioning by providing a text prompt, which the model will continue given the image. Let's take
+another image to illustrate:
+
+
+ Image of the Eiffel Tower at night +
+
+Photo by [Denys Nevozhai](https://unsplash.com/@dnevozhai).
+
+Textual and image prompts can be passed to the model's processor as a single list to create appropriate inputs.
+
+```py
+>>> prompt = [
+...     "https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3501&q=80",
+...     "This is an image of ",
+... ]
+
+>>> inputs = processor(prompt, return_tensors="pt").to("cuda")
+>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+
+>>> generated_ids = model.generate(**inputs, max_new_tokens=10, bad_words_ids=bad_words_ids)
+>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+>>> print(generated_text[0])
+This is an image of the Eiffel Tower in Paris, France.
+```
+
+## Few-shot prompting
+
+While IDEFICS demonstrates great zero-shot results, your task may require a certain format of the caption, or come with
+other restrictions or requirements that increase the task's complexity. Few-shot prompting can be used to enable in-context learning.
+By providing examples in the prompt, you can steer the model to generate results that mimic the format of the given examples.
+
+Let's use the previous image of the Eiffel Tower as an example for the model and build a prompt that demonstrates to the model
+that in addition to learning what the object in an image is, we would also like to get some interesting information about it.
+Then, let's see if we can get the same response format for an image of the Statue of Liberty:
+
+
+ Image of the Statue of Liberty +
+ +Photo by [Juan Mayobre](https://unsplash.com/@jmayobres). + +```py +>>> prompt = ["User:", +... "https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3501&q=80", +... "Describe this image.\nAssistant: An image of the Eiffel Tower at night. Fun fact: the Eiffel Tower is the same height as an 81-storey building.\n", +... "User:", +... "https://images.unsplash.com/photo-1524099163253-32b7f0256868?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3387&q=80", +... "Describe this image.\nAssistant:" +... ] + +>>> inputs = processor(prompt, return_tensors="pt").to("cuda") +>>> bad_words_ids = processor.tokenizer(["", ""], add_special_tokens=False).input_ids + +>>> generated_ids = model.generate(**inputs, max_new_tokens=30, bad_words_ids=bad_words_ids) +>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) +>>> print(generated_text[0]) +User: Describe this image. +Assistant: An image of the Eiffel Tower at night. Fun fact: the Eiffel Tower is the same height as an 81-storey building. +User: Describe this image. +Assistant: An image of the Statue of Liberty. Fun fact: the Statue of Liberty is 151 feet tall. +``` + +Notice that just from a single example (i.e., 1-shot) the model has learned how to perform the task. For more complex tasks, +feel free to experiment with a larger number of examples (e.g., 3-shot, 5-shot, etc.). + +## Visual question answering + +Visual Question Answering (VQA) is the task of answering open-ended questions based on an image. Similar to image +captioning it can be used in accessibility applications, but also in education (reasoning about visual materials), customer +service (questions about products based on images), and image retrieval. + +Let's get a new image for this task: + +
+ Image of a couple having a picnic +
+ +Photo by [Jarritos Mexican Soda](https://unsplash.com/@jarritos). + +You can steer the model from image captioning to visual question answering by prompting it with appropriate instructions: + +```py +>>> prompt = [ +... "Instruction: Provide an answer to the question. Use the image to answer.\n", +... "https://images.unsplash.com/photo-1623944889288-cd147dbb517c?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80", +... "Question: Where are these people and what's the weather like? Answer:" +... ] + +>>> inputs = processor(prompt, return_tensors="pt").to("cuda") +>>> bad_words_ids = processor.tokenizer(["", ""], add_special_tokens=False).input_ids + +>>> generated_ids = model.generate(**inputs, max_new_tokens=20, bad_words_ids=bad_words_ids) +>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) +>>> print(generated_text[0]) +Instruction: Provide an answer to the question. Use the image to answer. + Question: Where are these people and what's the weather like? Answer: They're in a park in New York City, and it's a beautiful day. +``` + +## Image classification + +IDEFICS is capable of classifying images into different categories without being explicitly trained on data containing +labeled examples from those specific categories. Given a list of categories and using its image and text understanding +capabilities, the model can infer which category the image likely belongs to. + +Say, we have this image of a vegetable stand: + +
+ Image of a vegetable stand +
+ +Photo by [Peter Wendt](https://unsplash.com/@peterwendt). + +We can instruct the model to classify the image into one of the categories that we have: + +```py +>>> categories = ['animals','vegetables', 'city landscape', 'cars', 'office'] +>>> prompt = [f"Instruction: Classify the following image into a single category from the following list: {categories}.\n", +... "https://images.unsplash.com/photo-1471193945509-9ad0617afabf?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80", +... "Category: " +... ] + +>>> inputs = processor(prompt, return_tensors="pt").to("cuda") +>>> bad_words_ids = processor.tokenizer(["", ""], add_special_tokens=False).input_ids + +>>> generated_ids = model.generate(**inputs, max_new_tokens=6, bad_words_ids=bad_words_ids) +>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) +>>> print(generated_text[0]) +Instruction: Classify the following image into a single category from the following list: ['animals', 'vegetables', 'city landscape', 'cars', 'office']. +Category: Vegetables +``` + +In the example above we instruct the model to classify the image into a single category, however, you can also prompt the model to do rank classification. + +## Image-guided text generation + +For more creative applications, you can use image-guided text generation to generate text based on an image. This can be +useful to create descriptions of products, ads, descriptions of a scene, etc. + +Let's prompt IDEFICS to write a story based on a simple image of a red door: + +
+ Image of a red door with a pumpkin on the steps +
+ +Photo by [Craig Tidball](https://unsplash.com/@devonshiremedia). + +```py +>>> prompt = ["Instruction: Use the image to write a story. \n", +... "https://images.unsplash.com/photo-1517086822157-2b0358e7684a?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=2203&q=80", +... "Story: \n"] + +>>> inputs = processor(prompt, return_tensors="pt").to("cuda") +>>> bad_words_ids = processor.tokenizer(["", ""], add_special_tokens=False).input_ids + +>>> generated_ids = model.generate(**inputs, num_beams=2, max_new_tokens=200, bad_words_ids=bad_words_ids) +>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) +>>> print(generated_text[0]) +Instruction: Use the image to write a story. + Story: +Once upon a time, there was a little girl who lived in a house with a red door. She loved her red door. It was the prettiest door in the whole world. + +One day, the little girl was playing in her yard when she noticed a man standing on her doorstep. He was wearing a long black coat and a top hat. + +The little girl ran inside and told her mother about the man. + +Her mother said, “Don’t worry, honey. He’s just a friendly ghost.” + +The little girl wasn’t sure if she believed her mother, but she went outside anyway. + +When she got to the door, the man was gone. + +The next day, the little girl was playing in her yard again when she noticed the man standing on her doorstep. + +He was wearing a long black coat and a top hat. + +The little girl ran +``` + +Looks like IDEFICS noticed the pumpkin on the doorstep and went with a spooky Halloween story about a ghost. + + + +For longer outputs like this, you will greatly benefit from tweaking the text generation strategy. This can help +you significantly improve the quality of the generated output. Check out [Text generation strategies](../generation_strategies) +to learn more. + + +## Running inference in batch mode + +All of the earlier sections illustrated IDEFICS for a single example. In a very similar fashion, you can run inference +for a batch of examples by passing a list of prompts: + +```py +>>> prompts = [ +... [ "https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3501&q=80", +... "This is an image of ", +... ], +... [ "https://images.unsplash.com/photo-1623944889288-cd147dbb517c?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80", +... "This is an image of ", +... ], +... [ "https://images.unsplash.com/photo-1471193945509-9ad0617afabf?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80", +... "This is an image of ", +... ], +... ] + +>>> inputs = processor(prompts, return_tensors="pt").to("cuda") +>>> bad_words_ids = processor.tokenizer(["", ""], add_special_tokens=False).input_ids + +>>> generated_ids = model.generate(**inputs, max_new_tokens=10, bad_words_ids=bad_words_ids) +>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) +>>> for i,t in enumerate(generated_text): +... print(f"{i}:\n{t}\n") +0: +This is an image of the Eiffel Tower in Paris, France. + +1: +This is an image of a couple on a picnic blanket. + +2: +This is an image of a vegetable stand. 
+``` + +## IDEFICS instruct for conversational use + +For conversational use cases, you can find fine-tuned instructed versions of the model on the 🤗 Hub: +`HuggingFaceM4/idefics-80b-instruct` and `HuggingFaceM4/idefics-9b-instruct`. + +These checkpoints are the result of fine-tuning the respective base models on a mixture of supervised and instruction +fine-tuning datasets, which boosts the downstream performance while making the models more usable in conversational settings. + +The use and prompting for the conversational use is very similar to using the base models: + +```py +>>> import torch +>>> from transformers import IdeficsForVisionText2Text, AutoProcessor + +>>> device = "cuda" if torch.cuda.is_available() else "cpu" + +>>> checkpoint = "HuggingFaceM4/idefics-9b-instruct" +>>> model = IdeficsForVisionText2Text.from_pretrained(checkpoint, torch_dtype=torch.bfloat16).to(device) +>>> processor = AutoProcessor.from_pretrained(checkpoint) + +>>> prompts = [ +... [ +... "User: What is in this image?", +... "https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG", +... "", + +... "\nAssistant: This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. Idefix is running on the ground.", + +... "\nUser:", +... "https://static.wikia.nocookie.net/asterix/images/2/25/R22b.gif/revision/latest?cb=20110815073052", +... "And who is that?", + +... "\nAssistant:", +... ], +... ] + +>>> # --batched mode +>>> inputs = processor(prompts, add_end_of_utterance_token=False, return_tensors="pt").to(device) +>>> # --single sample mode +>>> # inputs = processor(prompts[0], return_tensors="pt").to(device) + +>>> # Generation args +>>> exit_condition = processor.tokenizer("", add_special_tokens=False).input_ids +>>> bad_words_ids = processor.tokenizer(["", ""], add_special_tokens=False).input_ids + +>>> generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_length=100) +>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) +>>> for i, t in enumerate(generated_text): +... print(f"{i}:\n{t}\n") +``` diff --git a/docs/source/en/tasks/image_captioning.md b/docs/source/en/tasks/image_captioning.md new file mode 100644 index 000000000000..71e81b4651bd --- /dev/null +++ b/docs/source/en/tasks/image_captioning.md @@ -0,0 +1,276 @@ + + + +# Image captioning + +[[open-in-colab]] + +Image captioning is the task of predicting a caption for a given image. Common real world applications of it include +aiding visually impaired people that can help them navigate through different situations. Therefore, image captioning +helps to improve content accessibility for people by describing images to them. + +This guide will show you how to: + +* Fine-tune an image captioning model. +* Use the fine-tuned model for inference. + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate -q +pip install jiwer -q +``` + +We encourage you to log in to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to log in: + + +```python +from huggingface_hub import notebook_login + +notebook_login() +``` + +## Load the Pokémon BLIP captions dataset + +Use the 🤗 Dataset library to load a dataset that consists of {image-caption} pairs. 
To create your own image captioning dataset +in PyTorch, you can follow [this notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/GIT/Fine_tune_GIT_on_an_image_captioning_dataset.ipynb). + + +```python +from datasets import load_dataset + +ds = load_dataset("lambdalabs/pokemon-blip-captions") +ds +``` +```bash +DatasetDict({ + train: Dataset({ + features: ['image', 'text'], + num_rows: 833 + }) +}) +``` + +The dataset has two features, `image` and `text`. + + + +Many image captioning datasets contain multiple captions per image. In those cases, a common strategy is to randomly sample a caption amongst the available ones during training. + + + +Split the dataset’s train split into a train and test set with the [~datasets.Dataset.train_test_split] method: + + +```python +ds = ds["train"].train_test_split(test_size=0.1) +train_ds = ds["train"] +test_ds = ds["test"] +``` + +Let's visualize a couple of samples from the training set. + + +```python +from textwrap import wrap +import matplotlib.pyplot as plt +import numpy as np + + +def plot_images(images, captions): + plt.figure(figsize=(20, 20)) + for i in range(len(images)): + ax = plt.subplot(1, len(images), i + 1) + caption = captions[i] + caption = "\n".join(wrap(caption, 12)) + plt.title(caption) + plt.imshow(images[i]) + plt.axis("off") + + +sample_images_to_visualize = [np.array(train_ds[i]["image"]) for i in range(5)] +sample_captions = [train_ds[i]["text"] for i in range(5)] +plot_images(sample_images_to_visualize, sample_captions) +``` + +
+ Sample training images +
+ +## Preprocess the dataset + +Since the dataset has two modalities (image and text), the pre-processing pipeline will preprocess images and the captions. + +To do so, load the processor class associated with the model you are about to fine-tune. + +```python +from transformers import AutoProcessor + +checkpoint = "microsoft/git-base" +processor = AutoProcessor.from_pretrained(checkpoint) +``` + +The processor will internally pre-process the image (which includes resizing, and pixel scaling) and tokenize the caption. + +```python +def transforms(example_batch): + images = [x for x in example_batch["image"]] + captions = [x for x in example_batch["text"]] + inputs = processor(images=images, text=captions, padding="max_length") + inputs.update({"labels": inputs["input_ids"]}) + return inputs + + +train_ds.set_transform(transforms) +test_ds.set_transform(transforms) +``` + +With the dataset ready, you can now set up the model for fine-tuning. + +## Load a base model + +Load the ["microsoft/git-base"](https://huggingface.co/microsoft/git-base) into a [`AutoModelForCausalLM`](https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoModelForCausalLM) object. + + +```python +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained(checkpoint) +``` + +## Evaluate + +Image captioning models are typically evaluated with the [Rouge Score](https://huggingface.co/spaces/evaluate-metric/rouge) or [Word Error Rate](https://huggingface.co/spaces/evaluate-metric/wer). For this guide, you will use the Word Error Rate (WER). + +We use the 🤗 Evaluate library to do so. For potential limitations and other gotchas of the WER, refer to [this guide](https://huggingface.co/spaces/evaluate-metric/wer). + + +```python +from evaluate import load +import torch + +wer = load("wer") + + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predicted = logits.argmax(-1) + decoded_labels = processor.batch_decode(labels, skip_special_tokens=True) + decoded_predictions = processor.batch_decode(predicted, skip_special_tokens=True) + wer_score = wer.compute(predictions=decoded_predictions, references=decoded_labels) + return {"wer_score": wer_score} +``` + +## Train! + +Now, you are ready to start fine-tuning the model. You will use the 🤗 [`Trainer`] for this. + +First, define the training arguments using [`TrainingArguments`]. + + +```python +from transformers import TrainingArguments, Trainer + +model_name = checkpoint.split("/")[1] + +training_args = TrainingArguments( + output_dir=f"{model_name}-pokemon", + learning_rate=5e-5, + num_train_epochs=50, + fp16=True, + per_device_train_batch_size=32, + per_device_eval_batch_size=32, + gradient_accumulation_steps=2, + save_total_limit=3, + evaluation_strategy="steps", + eval_steps=50, + save_strategy="steps", + save_steps=50, + logging_steps=50, + remove_unused_columns=False, + push_to_hub=True, + label_names=["labels"], + load_best_model_at_end=True, +) +``` + +Then pass them along with the datasets and the model to 🤗 Trainer. + +```python +trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_ds, + eval_dataset=test_ds, + compute_metrics=compute_metrics, +) +``` + +To start training, simply call [`~Trainer.train`] on the [`Trainer`] object. + +```python +trainer.train() +``` + +You should see the training loss drop smoothly as training progresses. 
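+
+If you'd also like to check the word error rate defined earlier on the held-out split, you can call [`~Trainer.evaluate`]
+once training finishes. This is a minimal sketch that assumes the `trainer` configured above (with `eval_dataset=test_ds`
+and the `compute_metrics` function); the exact reported key may differ slightly depending on your setup.
+
+```python
+# Run evaluation on the eval_dataset passed to the Trainer (test_ds here).
+metrics = trainer.evaluate()
+
+# Trainer prefixes metric names with "eval_", so the score returned by
+# compute_metrics shows up as "eval_wer_score".
+print(metrics["eval_wer_score"])
+```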
+ +Once training is completed, share your model to the Hub with the [`~Trainer.push_to_hub`] method so everyone can use your model: + + +```python +trainer.push_to_hub() +``` + +## Inference + +Take a sample image from `test_ds` to test the model. + + +```python +from PIL import Image +import requests + +url = "https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/pokemon.png" +image = Image.open(requests.get(url, stream=True).raw) +image +``` + +
+ Test image +
+ +Prepare image for the model. + +```python +device = "cuda" if torch.cuda.is_available() else "cpu" + +inputs = processor(images=image, return_tensors="pt").to(device) +pixel_values = inputs.pixel_values +``` + +Call [`generate`] and decode the predictions. + +```python +generated_ids = model.generate(pixel_values=pixel_values, max_length=50) +generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] +print(generated_caption) +``` +```bash +a drawing of a pink and blue pokemon +``` + +Looks like the fine-tuned model generated a pretty good caption! diff --git a/docs/source/en/tasks/image_classification.md b/docs/source/en/tasks/image_classification.md new file mode 100644 index 000000000000..489ec59ddf6a --- /dev/null +++ b/docs/source/en/tasks/image_classification.md @@ -0,0 +1,547 @@ + + +# Image classification + +[[open-in-colab]] + + + +Image classification assigns a label or class to an image. Unlike text or audio classification, the inputs are the +pixel values that comprise an image. There are many applications for image classification, such as detecting damage +after a natural disaster, monitoring crop health, or helping screen medical images for signs of disease. + +This guide illustrates how to: + +1. Fine-tune [ViT](model_doc/vit) on the [Food-101](https://huggingface.co/datasets/food101) dataset to classify a food item in an image. +2. Use your fine-tuned model for inference. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[BEiT](../model_doc/beit), [BiT](../model_doc/bit), [ConvNeXT](../model_doc/convnext), [ConvNeXTV2](../model_doc/convnextv2), [CvT](../model_doc/cvt), [Data2VecVision](../model_doc/data2vec-vision), [DeiT](../model_doc/deit), [DiNAT](../model_doc/dinat), [DINOv2](../model_doc/dinov2), [EfficientFormer](../model_doc/efficientformer), [EfficientNet](../model_doc/efficientnet), [FocalNet](../model_doc/focalnet), [ImageGPT](../model_doc/imagegpt), [LeViT](../model_doc/levit), [MobileNetV1](../model_doc/mobilenet_v1), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [NAT](../model_doc/nat), [Perceiver](../model_doc/perceiver), [PoolFormer](../model_doc/poolformer), [PVT](../model_doc/pvt), [RegNet](../model_doc/regnet), [ResNet](../model_doc/resnet), [SegFormer](../model_doc/segformer), [SwiftFormer](../model_doc/swiftformer), [Swin Transformer](../model_doc/swin), [Swin Transformer V2](../model_doc/swinv2), [VAN](../model_doc/van), [ViT](../model_doc/vit), [ViT Hybrid](../model_doc/vit_hybrid), [ViTMSN](../model_doc/vit_msn) + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate +``` + +We encourage you to log in to your Hugging Face account to upload and share your model with the community. When prompted, enter your token to log in: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load Food-101 dataset + +Start by loading a smaller subset of the Food-101 dataset from the 🤗 Datasets library. This will give you a chance to +experiment and make sure everything works before spending more time training on the full dataset. 
+ +```py +>>> from datasets import load_dataset + +>>> food = load_dataset("food101", split="train[:5000]") +``` + +Split the dataset's `train` split into a train and test set with the [`~datasets.Dataset.train_test_split`] method: + +```py +>>> food = food.train_test_split(test_size=0.2) +``` + +Then take a look at an example: + +```py +>>> food["train"][0] +{'image': , + 'label': 79} +``` + +Each example in the dataset has two fields: + +- `image`: a PIL image of the food item +- `label`: the label class of the food item + +To make it easier for the model to get the label name from the label id, create a dictionary that maps the label name +to an integer and vice versa: + +```py +>>> labels = food["train"].features["label"].names +>>> label2id, id2label = dict(), dict() +>>> for i, label in enumerate(labels): +... label2id[label] = str(i) +... id2label[str(i)] = label +``` + +Now you can convert the label id to a label name: + +```py +>>> id2label[str(79)] +'prime_rib' +``` + +## Preprocess + +The next step is to load a ViT image processor to process the image into a tensor: + +```py +>>> from transformers import AutoImageProcessor + +>>> checkpoint = "google/vit-base-patch16-224-in21k" +>>> image_processor = AutoImageProcessor.from_pretrained(checkpoint) +``` + + + +Apply some image transformations to the images to make the model more robust against overfitting. Here you'll use torchvision's [`transforms`](https://pytorch.org/vision/stable/transforms.html) module, but you can also use any image library you like. + +Crop a random part of the image, resize it, and normalize it with the image mean and standard deviation: + +```py +>>> from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor + +>>> normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std) +>>> size = ( +... image_processor.size["shortest_edge"] +... if "shortest_edge" in image_processor.size +... else (image_processor.size["height"], image_processor.size["width"]) +... ) +>>> _transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize]) +``` + +Then create a preprocessing function to apply the transforms and return the `pixel_values` - the inputs to the model - of the image: + +```py +>>> def transforms(examples): +... examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]] +... del examples["image"] +... return examples +``` + +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.with_transform`] method. The transforms are applied on the fly when you load an element of the dataset: + +```py +>>> food = food.with_transform(transforms) +``` + +Now create a batch of examples using [`DefaultDataCollator`]. Unlike other data collators in 🤗 Transformers, the `DefaultDataCollator` does not apply additional preprocessing such as padding. + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator() +``` + + + + + + + +To avoid overfitting and to make the model more robust, add some data augmentation to the training part of the dataset. +Here we use Keras preprocessing layers to define the transformations for the training data (includes data augmentation), +and transformations for the validation data (only center cropping, resizing and normalizing). You can use `tf.image`or +any other library you prefer. 
+ +```py +>>> from tensorflow import keras +>>> from tensorflow.keras import layers + +>>> size = (image_processor.size["height"], image_processor.size["width"]) + +>>> train_data_augmentation = keras.Sequential( +... [ +... layers.RandomCrop(size[0], size[1]), +... layers.Rescaling(scale=1.0 / 127.5, offset=-1), +... layers.RandomFlip("horizontal"), +... layers.RandomRotation(factor=0.02), +... layers.RandomZoom(height_factor=0.2, width_factor=0.2), +... ], +... name="train_data_augmentation", +... ) + +>>> val_data_augmentation = keras.Sequential( +... [ +... layers.CenterCrop(size[0], size[1]), +... layers.Rescaling(scale=1.0 / 127.5, offset=-1), +... ], +... name="val_data_augmentation", +... ) +``` + +Next, create functions to apply appropriate transformations to a batch of images, instead of one image at a time. + +```py +>>> import numpy as np +>>> import tensorflow as tf +>>> from PIL import Image + + +>>> def convert_to_tf_tensor(image: Image): +... np_image = np.array(image) +... tf_image = tf.convert_to_tensor(np_image) +... # `expand_dims()` is used to add a batch dimension since +... # the TF augmentation layers operates on batched inputs. +... return tf.expand_dims(tf_image, 0) + + +>>> def preprocess_train(example_batch): +... """Apply train_transforms across a batch.""" +... images = [ +... train_data_augmentation(convert_to_tf_tensor(image.convert("RGB"))) for image in example_batch["image"] +... ] +... example_batch["pixel_values"] = [tf.transpose(tf.squeeze(image)) for image in images] +... return example_batch + + +... def preprocess_val(example_batch): +... """Apply val_transforms across a batch.""" +... images = [ +... val_data_augmentation(convert_to_tf_tensor(image.convert("RGB"))) for image in example_batch["image"] +... ] +... example_batch["pixel_values"] = [tf.transpose(tf.squeeze(image)) for image in images] +... return example_batch +``` + +Use 🤗 Datasets [`~datasets.Dataset.set_transform`] to apply the transformations on the fly: + +```py +food["train"].set_transform(preprocess_train) +food["test"].set_transform(preprocess_val) +``` + +As a final preprocessing step, create a batch of examples using `DefaultDataCollator`. Unlike other data collators in 🤗 Transformers, the +`DefaultDataCollator` does not apply additional preprocessing, such as padding. + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator(return_tensors="tf") +``` + + + +## Evaluate + +Including a metric during training is often helpful for evaluating your model's performance. You can quickly load an +evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load +the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): + +```py +>>> import evaluate + +>>> accuracy = evaluate.load("accuracy") +``` + +Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the accuracy: + +```py +>>> import numpy as np + + +>>> def compute_metrics(eval_pred): +... predictions, labels = eval_pred +... predictions = np.argmax(predictions, axis=1) +... return accuracy.compute(predictions=predictions, references=labels) +``` + +Your `compute_metrics` function is ready to go now, and you'll return to it when you set up your training. 
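+
+If you'd like to sanity-check the function before training, you can call it on a couple of dummy values. The logits and
+labels below are made up purely for illustration and are not part of the Food-101 dataset.
+
+```py
+>>> dummy_logits = np.array([[0.1, 0.8, 0.1], [0.7, 0.2, 0.1]])  # two samples, three hypothetical classes
+>>> dummy_labels = np.array([1, 2])  # the second prediction (argmax = 0) is wrong
+>>> compute_metrics((dummy_logits, dummy_labels))
+{'accuracy': 0.5}
+```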
+ +## Train + + + + + +If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! + + + +You're ready to start training your model now! Load ViT with [`AutoModelForImageClassification`]. Specify the number of labels along with the number of expected labels, and the label mappings: + +```py +>>> from transformers import AutoModelForImageClassification, TrainingArguments, Trainer + +>>> model = AutoModelForImageClassification.from_pretrained( +... checkpoint, +... num_labels=len(labels), +... id2label=id2label, +... label2id=label2id, +... ) +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`TrainingArguments`]. It is important you don't remove unused columns because that'll drop the `image` column. Without the `image` column, you can't create `pixel_values`. Set `remove_unused_columns=False` to prevent this behavior! The only other required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the accuracy and save the training checkpoint. +2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_food_model", +... remove_unused_columns=False, +... evaluation_strategy="epoch", +... save_strategy="epoch", +... learning_rate=5e-5, +... per_device_train_batch_size=16, +... gradient_accumulation_steps=4, +... per_device_eval_batch_size=16, +... num_train_epochs=3, +... warmup_ratio=0.1, +... logging_steps=10, +... load_best_model_at_end=True, +... metric_for_best_model="accuracy", +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... data_collator=data_collator, +... train_dataset=food["train"], +... eval_dataset=food["test"], +... tokenizer=image_processor, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + + + + + +If you are unfamiliar with fine-tuning a model with Keras, check out the [basic tutorial](./training#train-a-tensorflow-model-with-keras) first! + + + +To fine-tune a model in TensorFlow, follow these steps: +1. Define the training hyperparameters, and set up an optimizer and a learning rate schedule. +2. Instantiate a pre-trained model. +3. Convert a 🤗 Dataset to a `tf.data.Dataset`. +4. Compile your model. +5. Add callbacks and use the `fit()` method to run the training. +6. Upload your model to 🤗 Hub to share with the community. + +Start by defining the hyperparameters, optimizer and learning rate schedule: + +```py +>>> from transformers import create_optimizer + +>>> batch_size = 16 +>>> num_epochs = 5 +>>> num_train_steps = len(food["train"]) * num_epochs +>>> learning_rate = 3e-5 +>>> weight_decay_rate = 0.01 + +>>> optimizer, lr_schedule = create_optimizer( +... init_lr=learning_rate, +... num_train_steps=num_train_steps, +... weight_decay_rate=weight_decay_rate, +... num_warmup_steps=0, +... 
) +``` + +Then, load ViT with [`TFAutoModelForImageClassification`] along with the label mappings: + +```py +>>> from transformers import TFAutoModelForImageClassification + +>>> model = TFAutoModelForImageClassification.from_pretrained( +... checkpoint, +... id2label=id2label, +... label2id=label2id, +... ) +``` + +Convert your datasets to the `tf.data.Dataset` format using the [`~datasets.Dataset.to_tf_dataset`] and your `data_collator`: + +```py +>>> # converting our train dataset to tf.data.Dataset +>>> tf_train_dataset = food["train"].to_tf_dataset( +... columns="pixel_values", label_cols="label", shuffle=True, batch_size=batch_size, collate_fn=data_collator +... ) + +>>> # converting our test dataset to tf.data.Dataset +>>> tf_eval_dataset = food["test"].to_tf_dataset( +... columns="pixel_values", label_cols="label", shuffle=True, batch_size=batch_size, collate_fn=data_collator +... ) +``` + +Configure the model for training with `compile()`: + +```py +>>> from tensorflow.keras.losses import SparseCategoricalCrossentropy + +>>> loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) +>>> model.compile(optimizer=optimizer, loss=loss) +``` + +To compute the accuracy from the predictions and push your model to the 🤗 Hub, use [Keras callbacks](../main_classes/keras_callbacks). +Pass your `compute_metrics` function to [KerasMetricCallback](../main_classes/keras_callbacks#transformers.KerasMetricCallback), +and use the [PushToHubCallback](../main_classes/keras_callbacks#transformers.PushToHubCallback) to upload the model: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_eval_dataset) +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="food_classifier", +... tokenizer=image_processor, +... save_strategy="no", +... ) +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +Finally, you are ready to train your model! Call `fit()` with your training and validation datasets, the number of epochs, +and your callbacks to fine-tune the model: + +```py +>>> model.fit(tf_train_dataset, validation_data=tf_eval_dataset, epochs=num_epochs, callbacks=callbacks) +Epoch 1/5 +250/250 [==============================] - 313s 1s/step - loss: 2.5623 - val_loss: 1.4161 - accuracy: 0.9290 +Epoch 2/5 +250/250 [==============================] - 265s 1s/step - loss: 0.9181 - val_loss: 0.6808 - accuracy: 0.9690 +Epoch 3/5 +250/250 [==============================] - 252s 1s/step - loss: 0.3910 - val_loss: 0.4303 - accuracy: 0.9820 +Epoch 4/5 +250/250 [==============================] - 251s 1s/step - loss: 0.2028 - val_loss: 0.3191 - accuracy: 0.9900 +Epoch 5/5 +250/250 [==============================] - 238s 949ms/step - loss: 0.1232 - val_loss: 0.3259 - accuracy: 0.9890 +``` + +Congratulations! You have fine-tuned your model and shared it on the 🤗 Hub. You can now use it for inference! + + + + + + +For a more in-depth example of how to finetune a model for image classification, take a look at the corresponding [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). + + + +## Inference + +Great, now that you've fine-tuned a model, you can use it for inference! + +Load an image you'd like to run inference on: + +```py +>>> ds = load_dataset("food101", split="validation[:10]") +>>> image = ds["image"][0] +``` + +
+ image of beignets +
+ +The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for image classification with your model, and pass your image to it: + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline("image-classification", model="my_awesome_food_model") +>>> classifier(image) +[{'score': 0.31856709718704224, 'label': 'beignets'}, + {'score': 0.015232225880026817, 'label': 'bruschetta'}, + {'score': 0.01519392803311348, 'label': 'chicken_wings'}, + {'score': 0.013022331520915031, 'label': 'pork_chop'}, + {'score': 0.012728818692266941, 'label': 'prime_rib'}] +``` + +You can also manually replicate the results of the `pipeline` if you'd like: + + + +Load an image processor to preprocess the image and return the `input` as PyTorch tensors: + +```py +>>> from transformers import AutoImageProcessor +>>> import torch + +>>> image_processor = AutoImageProcessor.from_pretrained("my_awesome_food_model") +>>> inputs = image_processor(image, return_tensors="pt") +``` + +Pass your inputs to the model and return the logits: + +```py +>>> from transformers import AutoModelForImageClassification + +>>> model = AutoModelForImageClassification.from_pretrained("my_awesome_food_model") +>>> with torch.no_grad(): +... logits = model(**inputs).logits +``` + +Get the predicted label with the highest probability, and use the model's `id2label` mapping to convert it to a label: + +```py +>>> predicted_label = logits.argmax(-1).item() +>>> model.config.id2label[predicted_label] +'beignets' +``` + + + + + +Load an image processor to preprocess the image and return the `input` as TensorFlow tensors: + +```py +>>> from transformers import AutoImageProcessor + +>>> image_processor = AutoImageProcessor.from_pretrained("MariaK/food_classifier") +>>> inputs = image_processor(image, return_tensors="tf") +``` + +Pass your inputs to the model and return the logits: + +```py +>>> from transformers import TFAutoModelForImageClassification + +>>> model = TFAutoModelForImageClassification.from_pretrained("MariaK/food_classifier") +>>> logits = model(**inputs).logits +``` + +Get the predicted label with the highest probability, and use the model's `id2label` mapping to convert it to a label: + +```py +>>> predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0]) +>>> model.config.id2label[predicted_class_id] +'beignets' +``` + + + diff --git a/docs/source/en/tasks/image_classification.mdx b/docs/source/en/tasks/image_classification.mdx deleted file mode 100644 index 2543db6d2877..000000000000 --- a/docs/source/en/tasks/image_classification.mdx +++ /dev/null @@ -1,297 +0,0 @@ - - -# Image classification - -[[open-in-colab]] - - - -Image classification assigns a label or class to an image. Unlike text or audio classification, the inputs are the pixel values that comprise an image. There are many applications for image classification such as detecting damage after a natural disaster, monitoring crop health, or helping screen medical images for signs of disease. - -This guide will show you how to: - -1. Finetune [ViT](https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/vit) on the [Food-101](https://huggingface.co/datasets/food101) dataset to classify a food item in an image. -2. Use your finetuned model for inference. - - - -See the image classification [task page](https://huggingface.co/tasks/image-classification) for more information about its associated models, datasets, and metrics. 
- - - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install transformers datasets evaluate -``` - -We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load Food-101 dataset - -Start by loading a smaller subset of the Food-101 dataset from the 🤗 Datasets library. This'll give you a chance to experiment and make sure everythings works before spending more time training on the full dataset. - -```py ->>> from datasets import load_dataset - ->>> food = load_dataset("food101", split="train[:5000]") -``` - -Split the dataset's `train` split into a train and test set with the [`~datasets.Dataset.train_test_split`] method: - -```py ->>> food = food.train_test_split(test_size=0.2) -``` - -Then take a look at an example: - -```py ->>> food["train"][0] -{'image': , - 'label': 79} -``` - -There are two fields: - -- `image`: a PIL image of the food item. -- `label`: the label class of the food item. - -To make it easier for the model to get the label name from the label id, create a dictionary that maps the label name to an integer and vice versa: - -```py ->>> labels = food["train"].features["label"].names ->>> label2id, id2label = dict(), dict() ->>> for i, label in enumerate(labels): -... label2id[label] = str(i) -... id2label[str(i)] = label -``` - -Now you can convert the label id to a label name: - -```py ->>> id2label[str(79)] -'prime_rib' -``` - -## Preprocess - -The next step is to load a ViT image processor to process the image into a tensor: - -```py ->>> from transformers import AutoImageProcessor - ->>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") -``` - -Apply some image transformations to the images to make the model more robust against overfitting. Here you'll use torchvision's [`transforms`](https://pytorch.org/vision/stable/transforms.html) module, but you can also use any image library you like. - -Crop a random part of the image, resize it, and normalize it with the image mean and standard deviation: - -```py ->>> from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor - ->>> normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std) ->>> size = ( -... image_processor.size["shortest_edge"] -... if "shortest_edge" in image_processor.size -... else (image_processor.size["height"], image_processor.size["width"]) -... ) ->>> _transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize]) -``` - -Then create a preprocessing function to apply the transforms and return the `pixel_values` - the inputs to the model - of the image: - -```py ->>> def transforms(examples): -... examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]] -... del examples["image"] -... return examples -``` - -To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.with_transform`] method. The transforms are applied on the fly when you load an element of the dataset: - -```py ->>> food = food.with_transform(transforms) -``` - -Now create a batch of examples using [`DataCollatorWithPadding`]. Unlike other data collators in 🤗 Transformers, the `DefaultDataCollator` does not apply additional preprocessing such as padding. 
- -```py ->>> from transformers import DefaultDataCollator - ->>> data_collator = DefaultDataCollator() -``` - -## Evaluate - -Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): - -```py ->>> import evaluate - ->>> accuracy = evaluate.load("accuracy") -``` - -Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the accuracy: - -```py ->>> import numpy as np - - ->>> def compute_metrics(eval_pred): -... predictions = np.argmax(eval_pred.predictions, axis=1) -... return accuracy.compute(predictions=predictions, references=eval_pred.label_ids) -``` - -Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. - -## Train - - - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! - - -You're ready to start training your model now! Load ViT with [`AutoModelForImageClassification`]. Specify the number of labels along with the number of expected labels, and the label mappings: - -```py ->>> from transformers import AutoModelForImageClassification, TrainingArguments, Trainer - ->>> model = AutoModelForImageClassification.from_pretrained( -... "google/vit-base-patch16-224-in21k", -... num_labels=len(labels), -... id2label=id2label, -... label2id=label2id, -... ) -``` - -At this point, only three steps remain: - -1. Define your training hyperparameters in [`TrainingArguments`]. It is important you don't remove unused columns because this'll drop the `image` column. Without the `image` column, you can't create `pixel_values`. Set `remove_unused_columns=False` to prevent this behavior! The only other required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the accuracy and save the training checkpoint. -2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. -3. Call [`~Trainer.train`] to finetune your model. - -```py ->>> training_args = TrainingArguments( -... output_dir="my_awesome_food_model", -... remove_unused_columns=False, -... evaluation_strategy="epoch", -... save_strategy="epoch", -... learning_rate=5e-5, -... per_device_train_batch_size=16, -... gradient_accumulation_steps=4, -... per_device_eval_batch_size=16, -... num_train_epochs=3, -... warmup_ratio=0.1, -... logging_steps=10, -... load_best_model_at_end=True, -... metric_for_best_model="accuracy", -... push_to_hub=True, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... data_collator=data_collator, -... train_dataset=food["train"], -... eval_dataset=food["test"], -... tokenizer=image_processor, -... compute_metrics=compute_metrics, -... 
) - ->>> trainer.train() -``` - -Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - - - - - -For a more in-depth example of how to finetune a model for image classification, take a look at the corresponding [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). - - - -## Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Load an image you'd like to run inference on: - -```py ->>> ds = load_dataset("food101", split="validation[:10]") ->>> image = ds["image"][0] -``` - -
- image of beignets -
- -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for image classification with your model, and pass your image to it: - -```py ->>> from transformers import pipeline - ->>> classifier = pipeline("image-classification", model="my_awesome_food_model") ->>> classifier(image) -[{'score': 0.35574808716773987, 'label': 'beignets'}, - {'score': 0.018057454377412796, 'label': 'chicken_wings'}, - {'score': 0.017733804881572723, 'label': 'prime_rib'}, - {'score': 0.016335085034370422, 'label': 'bruschetta'}, - {'score': 0.0160061065107584, 'label': 'ramen'}] -``` -You can also manually replicate the results of the `pipeline` if you'd like: - - - -Load an image processor to preprocess the image and return the `input` as PyTorch tensors: - -```py ->>> from transformers import AutoImageProcessor ->>> import torch - ->>> image_processor = AutoImageProcessor.from_pretrained("my_awesome_food_model") ->>> inputs = image_processor(image, return_tensors="pt") -``` - -Pass your inputs to the model and return the logits: - -```py ->>> from transformers import AutoModelForImageClassification - ->>> model = AutoModelForImageClassification.from_pretrained("my_awesome_food_model") ->>> with torch.no_grad(): -... logits = model(**inputs).logits -``` - -Get the predicted label with the highest probability, and use the model's `id2label` mapping to convert it to a label: - -```py ->>> predicted_label = logits.argmax(-1).item() ->>> model.config.id2label[predicted_label] -'beignets' -``` - - \ No newline at end of file diff --git a/docs/source/en/tasks/language_modeling.md b/docs/source/en/tasks/language_modeling.md new file mode 100644 index 000000000000..42406788214c --- /dev/null +++ b/docs/source/en/tasks/language_modeling.md @@ -0,0 +1,425 @@ + + +# Causal language modeling + +[[open-in-colab]] + +There are two types of language modeling, causal and masked. This guide illustrates causal language modeling. +Causal language models are frequently used for text generation. You can use these models for creative applications like +choosing your own text adventure or an intelligent coding assistant like Copilot or CodeParrot. + + + +Causal language modeling predicts the next token in a sequence of tokens, and the model can only attend to tokens on +the left. This means the model cannot see future tokens. GPT-2 is an example of a causal language model. + +This guide will show you how to: + +1. Finetune [DistilGPT2](https://huggingface.co/distilgpt2) on the [r/askscience](https://www.reddit.com/r/askscience/) subset of the [ELI5](https://huggingface.co/datasets/eli5) dataset. +2. Use your finetuned model for inference. + + +You can finetune other architectures for causal language modeling following the same steps in this guide. 
+Choose one of the following architectures: + + +[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeLlama](../model_doc/code_llama), [CodeGen](../model_doc/codegen), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [Falcon](../model_doc/falcon), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [MPT](../model_doc/mpt), [MusicGen](../model_doc/musicgen), [MVP](../model_doc/mvp), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [Persimmon](../model_doc/persimmon), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod) + + + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate +``` + +We encourage you to log in to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to log in: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load ELI5 dataset + +Start by loading a smaller subset of the r/askscience subset of the ELI5 dataset from the 🤗 Datasets library. + This'll give you a chance to experiment and make sure everything works before spending more time training on the full dataset. + +```py +>>> from datasets import load_dataset + +>>> eli5 = load_dataset("eli5", split="train_asks[:5000]") +``` + +Split the dataset's `train_asks` split into a train and test set with the [`~datasets.Dataset.train_test_split`] method: + +```py +>>> eli5 = eli5.train_test_split(test_size=0.2) +``` + +Then take a look at an example: + +```py +>>> eli5["train"][0] +{'answers': {'a_id': ['c3d1aib', 'c3d4lya'], + 'score': [6, 3], + 'text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. I don't know the altitude of that specific mission, but they're usually around 300 km. 
That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.", + "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"]}, + 'answers_urls': {'url': []}, + 'document': '', + 'q_id': 'nyxfp', + 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?', + 'selftext_urls': {'url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg']}, + 'subreddit': 'askscience', + 'title': 'Few questions about this space walk photograph.', + 'title_urls': {'url': []}} +``` + +While this may look like a lot, you're only really interested in the `text` field. What's cool about language modeling +tasks is you don't need labels (also known as an unsupervised task) because the next word *is* the label. + +## Preprocess + + + +The next step is to load a DistilGPT2 tokenizer to process the `text` subfield: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") +``` + +You'll notice from the example above, the `text` field is actually nested inside `answers`. This means you'll need to +extract the `text` subfield from its nested structure with the [`flatten`](https://huggingface.co/docs/datasets/process.html#flatten) method: + +```py +>>> eli5 = eli5.flatten() +>>> eli5["train"][0] +{'answers.a_id': ['c3d1aib', 'c3d4lya'], + 'answers.score': [6, 3], + 'answers.text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. I don't know the altitude of that specific mission, but they're usually around 300 km. That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.", + "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"], + 'answers_urls.url': [], + 'document': '', + 'q_id': 'nyxfp', + 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?', + 'selftext_urls.url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg'], + 'subreddit': 'askscience', + 'title': 'Few questions about this space walk photograph.', + 'title_urls.url': []} +``` + +Each subfield is now a separate column as indicated by the `answers` prefix, and the `text` field is a list now. 
Instead +of tokenizing each sentence separately, convert the list to a string so you can jointly tokenize them. + +Here is a first preprocessing function to join the list of strings for each example and tokenize the result: + +```py +>>> def preprocess_function(examples): +... return tokenizer([" ".join(x) for x in examples["answers.text"]]) +``` + +To apply this preprocessing function over the entire dataset, use the 🤗 Datasets [`~datasets.Dataset.map`] method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once, and increasing the number of processes with `num_proc`. Remove any columns you don't need: + +```py +>>> tokenized_eli5 = eli5.map( +... preprocess_function, +... batched=True, +... num_proc=4, +... remove_columns=eli5["train"].column_names, +... ) +``` + +This dataset contains the token sequences, but some of these are longer than the maximum input length for the model. + +You can now use a second preprocessing function to +- concatenate all the sequences +- split the concatenated sequences into shorter chunks defined by `block_size`, which should be both shorter than the maximum input length and short enough for your GPU RAM. + +```py +>>> block_size = 128 + + +>>> def group_texts(examples): +... # Concatenate all texts. +... concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} +... total_length = len(concatenated_examples[list(examples.keys())[0]]) +... # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can +... # customize this part to your needs. +... if total_length >= block_size: +... total_length = (total_length // block_size) * block_size +... # Split by chunks of block_size. +... result = { +... k: [t[i : i + block_size] for i in range(0, total_length, block_size)] +... for k, t in concatenated_examples.items() +... } +... result["labels"] = result["input_ids"].copy() +... return result +``` + +Apply the `group_texts` function over the entire dataset: + +```py +>>> lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4) +``` + +Now create a batch of examples using [`DataCollatorForLanguageModeling`]. It's more efficient to *dynamically pad* the +sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. + + + +Use the end-of-sequence token as the padding token and set `mlm=False`. This will use the inputs as labels shifted to the right by one element: + +```py +>>> from transformers import DataCollatorForLanguageModeling + +>>> tokenizer.pad_token = tokenizer.eos_token +>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) +``` + + + +Use the end-of-sequence token as the padding token and set `mlm=False`. This will use the inputs as labels shifted to the right by one element: + +```py +>>> from transformers import DataCollatorForLanguageModeling + +>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="tf") +``` + + + + + +## Train + + + + + +If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the [basic tutorial](../training#train-with-pytorch-trainer)! + + + +You're ready to start training your model now! 
Load DistilGPT2 with [`AutoModelForCausalLM`]: + +```py +>>> from transformers import AutoModelForCausalLM, TrainingArguments, Trainer + +>>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). +2. Pass the training arguments to [`Trainer`] along with the model, datasets, and data collator. +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_eli5_clm-model", +... evaluation_strategy="epoch", +... learning_rate=2e-5, +... weight_decay=0.01, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=lm_dataset["train"], +... eval_dataset=lm_dataset["test"], +... data_collator=data_collator, +... ) + +>>> trainer.train() +``` + +Once training is completed, use the [`~transformers.Trainer.evaluate`] method to evaluate your model and get its perplexity: + +```py +>>> import math + +>>> eval_results = trainer.evaluate() +>>> print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}") +Perplexity: 49.61 +``` + +Then share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + +If you aren't familiar with finetuning a model with Keras, take a look at the [basic tutorial](../training#train-a-tensorflow-model-with-keras)! + + +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +```py +>>> from transformers import create_optimizer, AdamWeightDecay + +>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) +``` + +Then you can load DistilGPT2 with [`TFAutoModelForCausalLM`]: + +```py +>>> from transformers import TFAutoModelForCausalLM + +>>> model = TFAutoModelForCausalLM.from_pretrained("distilgpt2") +``` + +Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... lm_dataset["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_test_set = model.prepare_tf_dataset( +... lm_dataset["test"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +This can be done by specifying where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> callback = PushToHubCallback( +... output_dir="my_awesome_eli5_clm-model", +... tokenizer=tokenizer, +... ) +``` + +Finally, you're ready to start training your model! 
Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callback to finetune the model: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=[callback]) +``` + +Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! + + + + + +For a more in-depth example of how to finetune a model for causal language modeling, take a look at the corresponding +[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb) +or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb). + + + +## Inference + +Great, now that you've finetuned a model, you can use it for inference! + +Come up with a prompt you'd like to generate text from: + +```py +>>> prompt = "Somatic hypermutation allows the immune system to" +``` + +The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for text generation with your model, and pass your text to it: + +```py +>>> from transformers import pipeline + +>>> generator = pipeline("text-generation", model="my_awesome_eli5_clm-model") +>>> generator(prompt) +[{'generated_text': "Somatic hypermutation allows the immune system to be able to effectively reverse the damage caused by an infection.\n\n\nThe damage caused by an infection is caused by the immune system's ability to perform its own self-correcting tasks."}] +``` + + + +Tokenize the text and return the `input_ids` as PyTorch tensors: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_eli5_clm-model") +>>> inputs = tokenizer(prompt, return_tensors="pt").input_ids +``` + +Use the [`~transformers.generation_utils.GenerationMixin.generate`] method to generate text. +For more details about the different text generation strategies and parameters for controlling generation, check out the [Text generation strategies](../generation_strategies) page. + +```py +>>> from transformers import AutoModelForCausalLM + +>>> model = AutoModelForCausalLM.from_pretrained("my_awesome_eli5_clm-model") +>>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95) +``` + +Decode the generated token ids back into text: + +```py +>>> tokenizer.batch_decode(outputs, skip_special_tokens=True) +["Somatic hypermutation allows the immune system to react to drugs with the ability to adapt to a different environmental situation. In other words, a system of 'hypermutation' can help the immune system to adapt to a different environmental situation or in some cases even a single life. In contrast, researchers at the University of Massachusetts-Boston have found that 'hypermutation' is much stronger in mice than in humans but can be found in humans, and that it's not completely unknown to the immune system. A study on how the immune system"] +``` + + +Tokenize the text and return the `input_ids` as TensorFlow tensors: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_eli5_clm-model") +>>> inputs = tokenizer(prompt, return_tensors="tf").input_ids +``` + +Use the [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] method to generate text.
For more details about the different text generation strategies and parameters for controlling generation, check out the [Text generation strategies](../generation_strategies) page. + +```py +>>> from transformers import TFAutoModelForCausalLM + +>>> model = TFAutoModelForCausalLM.from_pretrained("my_awesome_eli5_clm-model") +>>> outputs = model.generate(input_ids=inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95) +``` + +Decode the generated token ids back into text: + +```py +>>> tokenizer.batch_decode(outputs, skip_special_tokens=True) +['Somatic hypermutation allows the immune system to detect the presence of other viruses as they become more prevalent. Therefore, researchers have identified a high proportion of human viruses. The proportion of virus-associated viruses in our study increases with age. Therefore, we propose a simple algorithm to detect the presence of these new viruses in our samples as a sign of improved immunity. A first study based on this algorithm, which will be published in Science on Friday, aims to show that this finding could translate into the development of a better vaccine that is more effective for'] +``` + + diff --git a/docs/source/en/tasks/language_modeling.mdx b/docs/source/en/tasks/language_modeling.mdx deleted file mode 100644 index eaf8fdc947f1..000000000000 --- a/docs/source/en/tasks/language_modeling.mdx +++ /dev/null @@ -1,669 +0,0 @@ - - -# Language modeling - -Language modeling tasks predicts words in a sentence, making these types of models great at generating text. You can use these models for creative applications like choosing your own text adventure or an intelligent coding assistant like Copilot or CodeParrot. There are two types of language modeling, causal and masked. - - - -Causal language modeling predicts the next token in a sequence of tokens, and the model can only attend to tokens on the left. This means the model cannot see future tokens. GPT-2 is an example of a causal language model. - - - -Masked language modeling predicts a masked token in a sequence, and the model can attend to tokens bidirectionally. This means the model has full access to the tokens on the left and right. BERT is an example of a masked language model. - -This guide will show you how to: - -1. Finetune [DistilGPT2](https://huggingface.co/distilgpt2) for causal language modeling and [DistilRoBERTa](https://huggingface.co/distilroberta-base) for masked language modeling on the [r/askscience](https://www.reddit.com/r/askscience/) subset of the [ELI5](https://huggingface.co/datasets/eli5) dataset. -2. Use your finetuned model for inference. - - - -You can finetune other architectures for language modeling such as [GPT-Neo](https://huggingface.co/EleutherAI/gpt-neo-125M), [GPT-J](https://huggingface.co/EleutherAI/gpt-j-6B), and [BERT](https://huggingface.co/bert-base-uncased), following the same steps in this guide! See the text generation [task page](https://huggingface.co/tasks/text-generation) and fill mask [task page](https://huggingface.co/tasks/fill-mask) for more information about their associated models, datasets, and metrics. - - - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install transformers datasets evaluate -``` - -We encourage you to login to your Hugging Face account so you can upload and share your model with the community. 
When prompted, enter your token to login: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load ELI5 dataset - -Start by loading a smaller subset of the r/askscience subset of the ELI5 dataset from the 🤗 Datasets library. This'll give you a chance to experiment and make sure everythings works before spending more time training on the full dataset. - -```py ->>> from datasets import load_dataset - ->>> eli5 = load_dataset("eli5", split="train_asks[:5000]") -``` - -Split the dataset's `train_asks` split into a train and test set with the [`~datasets.Dataset.train_test_split`] method: - -```py ->>> eli5 = eli5.train_test_split(test_size=0.2) -``` - -Then take a look at an example: - -```py ->>> eli5["train"][0] -{'answers': {'a_id': ['c3d1aib', 'c3d4lya'], - 'score': [6, 3], - 'text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. I don't know the altitude of that specific mission, but they're usually around 300 km. That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.", - "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"]}, - 'answers_urls': {'url': []}, - 'document': '', - 'q_id': 'nyxfp', - 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?', - 'selftext_urls': {'url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg']}, - 'subreddit': 'askscience', - 'title': 'Few questions about this space walk photograph.', - 'title_urls': {'url': []}} -``` - -While this may look like a lot, you're only really interested in the `text` field. What's cool about language modeling tasks is you don't need labels (also known as an unsupervised task) because the next word *is* the label. - -## Preprocess - - - -For causal language modeling, the next step is to load a DistilGPT2 tokenizer to process the `text` subfield: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") -``` - - - -For masked language modeling, the next step is to load a DistilRoBERTa tokenizer to process the `text` subfield: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("distilroberta-base") -``` - -You'll notice from the example above, the `text` field is actually nested inside `answers`. This means you'll need to extract the `text` subfield from its nested structure with the [`flatten`](https://huggingface.co/docs/datasets/process.html#flatten) method: - -```py ->>> eli5 = eli5.flatten() ->>> eli5["train"][0] -{'answers.a_id': ['c3d1aib', 'c3d4lya'], - 'answers.score': [6, 3], - 'answers.text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. 
I don't know the altitude of that specific mission, but they're usually around 300 km. That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.", - "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"], - 'answers_urls.url': [], - 'document': '', - 'q_id': 'nyxfp', - 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?', - 'selftext_urls.url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg'], - 'subreddit': 'askscience', - 'title': 'Few questions about this space walk photograph.', - 'title_urls.url': []} -``` - -Each subfield is now a separate column as indicated by the `answers` prefix, and the `text` field is a list now. Instead of tokenizing each sentence separately, convert the list to a string so you can jointly tokenize them. - -Here is how you can create a preprocessing function to convert the list to a string, and truncate sequences to be no longer than DistilGPT2's maximum input length: - -```py ->>> def preprocess_function(examples): -... return tokenizer([" ".join(x) for x in examples["answers.text"]], truncation=True) -``` - -To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.with_transform`] method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once, and increasing the number of processes with `num_proc`. Remove any columns you don't need: - -```py ->>> tokenized_eli5 = eli5.map( -... preprocess_function, -... batched=True, -... num_proc=4, -... remove_columns=eli5["train"].column_names, -... ) -``` - -Now you'll need a second preprocessing function to capture text truncated from the lengthier examples to avoid losing any information. This preprocessing function should: - -- Concatenate all the text. -- Split the concatenated text into smaller chunks defined by `block_size`. - -```py ->>> block_size = 128 - - ->>> def group_texts(examples): -... concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} -... total_length = len(concatenated_examples[list(examples.keys())[0]]) -... total_length = (total_length // block_size) * block_size -... result = { -... k: [t[i : i + block_size] for i in range(0, total_length, block_size)] -... for k, t in concatenated_examples.items() -... } -... result["labels"] = result["input_ids"].copy() -... return result -``` - -Apply the `group_texts` function over the entire dataset: - -```py ->>> lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4) -``` - -Now create a batch of examples using [`DataCollatorForLanguageModeling`]. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximium length. - - - -For causal language modeling, use the end-of-sequence token as the padding token and set `mlm=False`. 
This will use the inputs as labels shifted to the right by one element: - -```py ->>> from transformers import DataCollatorForLanguageModeling - ->>> tokenizer.pad_token = tokenizer.eos_token ->>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) -``` - -For masked language modeling, use the end-of-sequence token as the padding token and specify `mlm_probability` to randomly mask tokens each time you iterate over the data: - -```py ->>> from transformers import DataCollatorForLanguageModeling - ->>> tokenizer.pad_token = tokenizer.eos_token ->>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15) -``` - - -For causal language modeling, use the end-of-sequence token as the padding token and set `mlm=False`. This will use the inputs as labels shifted to the right by one element: - -```py ->>> from transformers import DataCollatorForLanguageModeling - ->>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="tf") -``` - -For masked language modeling, use the end-of-sequence token as the padding token and specify `mlm_probability` to randomly mask tokens each time you iterate over the data: - -```py ->>> from transformers import DataCollatorForLanguageModeling - ->>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="tf") -``` - - - -## Causal language modeling - -Causal language models are frequently used for text generation. This section shows you how to finetune [DistilGPT2](https://huggingface.co/distilgpt2) to generate new text. - -### Train - - - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! - - -You're ready to start training your model now! Load DistilGPT2 with [`AutoModelForCausalLM`]: - -```py ->>> from transformers import AutoModelForCausalLM, TrainingArguments, Trainer - ->>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") -``` - -At this point, only three steps remain: - -1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). -2. Pass the training arguments to [`Trainer`] along with the model, datasets, and data collator. -3. Call [`~Trainer.train`] to finetune your model. - -```py ->>> training_args = TrainingArguments( -... output_dir="my_awesome_eli5_clm-model", -... evaluation_strategy="epoch", -... learning_rate=2e-5, -... weight_decay=0.01, -... push_to_hub=True, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=lm_dataset["train"], -... eval_dataset=lm_dataset["test"], -... data_collator=data_collator, -... 
) - ->>> trainer.train() -``` - -Once training is completed, use the [`~transformers.Trainer.evaluate`] method to evaluate your model and get its perplexity: - -```py ->>> import math - ->>> eval_results = trainer.evaluate() ->>> print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}") -Perplexity: 49.61 -``` - -Then share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - - - - -If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! - - -To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: - -```py ->>> from transformers import create_optimizer, AdamWeightDecay - ->>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) -``` - -Then you can load DistilGPT2 with [`TFAutoModelForCausalLM`]: - -```py ->>> from transformers import TFAutoModelForCausalLM - ->>> model = TFAutoModelForCausalLM.from_pretrained("distilgpt2") -``` - -Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: - -```py ->>> tf_train_set = model.prepare_tf_dataset( -... lm_dataset["train"], -... shuffle=True, -... batch_size=16, -... collate_fn=data_collator, -... ) - ->>> tf_test_set = model.prepare_tf_dataset( -... lm_dataset["test"], -... shuffle=False, -... batch_size=16, -... collate_fn=data_collator, -... ) -``` - -Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> import tensorflow as tf - ->>> model.compile(optimizer=optimizer) -``` - -This can be done by specifying where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: - -```py ->>> from transformers.keras_callbacks import PushToHubCallback - ->>> callback = PushToHubCallback( -... output_dir="my_awesome_eli5_clm-model", -... tokenizer=tokenizer, -... ) -``` - -Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callback to finetune the model: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=[callback]) -``` - -Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! - - - - - -For a more in-depth example of how to finetune a model for causal language modeling, take a look at the corresponding -[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb) -or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb). - - - -### Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Come up with a prompt you'd like to generate text from: - -```py ->>> prompt = "Somatic hypermutation allows the immune system to" -``` - -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. 
Instantiate a `pipeline` for text generation with your model, and pass your text to it: - -```py ->>> from transformers import pipeline - ->>> generator = pipeline("text-generation", model="my_awesome_eli5_clm-model") ->>> generator(prompt) -[{'generated_text': "Somatic hypermutation allows the immune system to be able to effectively reverse the damage caused by an infection.\n\n\nThe damage caused by an infection is caused by the immune system's ability to perform its own self-correcting tasks."}] -``` - - - -Tokenize the text and return the `input_ids` as PyTorch tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_eli5_clm-model") ->>> inputs = tokenizer(prompt, return_tensors="pt").input_ids -``` - -Use the [`~transformers.generation_utils.GenerationMixin.generate`] method to generate text. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](./main_classes/text_generation) API. - -```py ->>> from transformers import AutoModelForCausalLM - ->>> model = AutoModelForCausalLM.from_pretrained("my_awesome_eli5_clm-model") ->>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95) -``` - -Decode the generated token ids back into text: - -```py ->>> tokenizer.batch_decode(outputs, skip_special_tokens=True) -["Somatic hypermutation allows the immune system to react to drugs with the ability to adapt to a different environmental situation. In other words, a system of 'hypermutation' can help the immune system to adapt to a different environmental situation or in some cases even a single life. In contrast, researchers at the University of Massachusetts-Boston have found that 'hypermutation' is much stronger in mice than in humans but can be found in humans, and that it's not completely unknown to the immune system. A study on how the immune system"] -``` - - -Tokenize the text and return the `input_ids` as TensorFlow tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_eli5_clm-model") ->>> inputs = tokenizer(prompt, return_tensors="tf").input_ids -``` - -Use the [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] method to create the summarization. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](./main_classes/text_generation) API. - -```py ->>> from transformers import TFAutoModelForCausalLM - ->>> model = TFAutoModelForCausalLM.from_pretrained("my_awesome_eli5_clm-model") ->>> outputs = model.generate(input_ids=inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95) -``` - -Decode the generated token ids back into text: - -```py ->>> tokenizer.batch_decode(outputs, skip_special_tokens=True) -['Somatic hypermutation allows the immune system to detect the presence of other viruses as they become more prevalent. Therefore, researchers have identified a high proportion of human viruses. The proportion of virus-associated viruses in our study increases with age. Therefore, we propose a simple algorithm to detect the presence of these new viruses in our samples as a sign of improved immunity. 
A first study based on this algorithm, which will be published in Science on Friday, aims to show that this finding could translate into the development of a better vaccine that is more effective for'] -``` - - - -## Masked language modeling - -Masked language modeling are good for tasks that require a good contextual understanding of an entire sequence. This section shows you how to finetune [DistilRoBERTa](https://huggingface.co/distilroberta-base) to predict a masked word. - -### Train - - - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! - - -You're ready to start training your model now! Load DistilRoBERTa with [`AutoModelForMaskedLM`]: - -```py ->>> from transformers import AutoModelForMaskedLM - ->>> model = AutoModelForMaskedLM.from_pretrained("distilroberta-base") -``` - -At this point, only three steps remain: - -1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). -2. Pass the training arguments to [`Trainer`] along with the model, datasets, and data collator. -3. Call [`~Trainer.train`] to finetune your model. - -```py ->>> training_args = TrainingArguments( -... output_dir="my_awesome_eli5_mlm_model", -... evaluation_strategy="epoch", -... learning_rate=2e-5, -... num_train_epochs=3, -... weight_decay=0.01, -... push_to_hub=True, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=lm_dataset["train"], -... eval_dataset=lm_dataset["test"], -... data_collator=data_collator, -... ) - ->>> trainer.train() -``` - -Once training is completed, use the [`~transformers.Trainer.evaluate`] method to evaluate your model and get its perplexity: - -```py ->>> import math - ->>> eval_results = trainer.evaluate() ->>> print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}") -Perplexity: 8.76 -``` - -Then share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - - - - -If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! - - -To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: - -```py ->>> from transformers import create_optimizer, AdamWeightDecay - ->>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) -``` - -Then you can load DistilRoBERTa with [`TFAutoModelForMaskedLM`]: - -```py ->>> from transformers import TFAutoModelForMaskedLM - ->>> model = TFAutoModelForMaskedLM.from_pretrained("distilroberta-base") -``` - -Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: - -```py ->>> tf_train_set = model.prepare_tf_dataset( -... lm_dataset["train"], -... shuffle=True, -... batch_size=16, -... collate_fn=data_collator, -... ) - ->>> tf_test_set = model.prepare_tf_dataset( -... lm_dataset["test"], -... shuffle=False, -... batch_size=16, -... collate_fn=data_collator, -... 
) -``` - -Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> import tensorflow as tf - ->>> model.compile(optimizer=optimizer) -``` - -This can be done by specifying where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: - -```py ->>> from transformers.keras_callbacks import PushToHubCallback - ->>> callback = PushToHubCallback( -... output_dir="my_awesome_eli5_mlm_model", -... tokenizer=tokenizer, -... ) -``` - -Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callback to finetune the model: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=[callback]) -``` - -Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! - - - - - -For a more in-depth example of how to finetune a model for masked language modeling, take a look at the corresponding -[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb) -or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb). - - - -### Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Come up with some text you'd like the model to fill in the blank with, and use the special `` token to indicate the blank: - -```py ->>> text = "The Milky Way is a galaxy." -``` - -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for fill-mask with your model, and pass your text to it. If you like, you can use the `top_k` parameter to specify how many predictions to return: - -```py ->>> from transformers import pipeline - ->>> mask_filler = pipeline("fill-mask", "stevhliu/my_awesome_eli5_mlm_model") ->>> mask_filler(text, top_k=3) -[{'score': 0.5150994658470154, - 'token': 21300, - 'token_str': ' spiral', - 'sequence': 'The Milky Way is a spiral galaxy.'}, - {'score': 0.07087188959121704, - 'token': 2232, - 'token_str': ' massive', - 'sequence': 'The Milky Way is a massive galaxy.'}, - {'score': 0.06434620916843414, - 'token': 650, - 'token_str': ' small', - 'sequence': 'The Milky Way is a small galaxy.'}] -``` - - - -Tokenize the text and return the `input_ids` as PyTorch tensors. You'll also need to specify the position of the `` token: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_eli5_mlm_model") ->>> inputs = tokenizer(text, return_tensors="pt") ->>> mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1] -``` - -Pass your inputs to the model and return the `logits` of the masked token: - -```py ->>> from transformers import AutoModelForMaskedLM - ->>> model = AutoModelForMaskedLM.from_pretrained("stevhliu/my_awesome_eli5_mlm_model") ->>> logits = model(**inputs).logits ->>> mask_token_logits = logits[0, mask_token_index, :] -``` - -Then return the three masked tokens with the highest probability and print them out: - -```py ->>> top_3_tokens = torch.topk(mask_token_logits, 3, dim=1).indices[0].tolist() - ->>> for token in top_3_tokens: -... print(text.replace(tokenizer.mask_token, tokenizer.decode([token]))) -The Milky Way is a spiral galaxy. -The Milky Way is a massive galaxy. 
-The Milky Way is a small galaxy. -``` - - -Tokenize the text and return the `input_ids` as TensorFlow tensors. You'll also need to specify the position of the `` token: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_eli5_mlm_model") ->>> inputs = tokenizer(text, return_tensors="tf") ->>> mask_token_index = tf.where(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1] -``` - -Pass your inputs to the model and return the `logits` of the masked token: - -```py ->>> from transformers import TFAutoModelForMaskedLM - ->>> model = TFAutoModelForMaskedLM.from_pretrained("stevhliu/my_awesome_eli5_mlm_model") ->>> logits = model(**inputs).logits ->>> mask_token_logits = logits[0, mask_token_index, :] -``` - -Then return the three masked tokens with the highest probability and print them out: - -```py ->>> top_3_tokens = tf.math.top_k(mask_token_logits, 3).indices.numpy() - ->>> for token in top_3_tokens: -... print(text.replace(tokenizer.mask_token, tokenizer.decode([token]))) -The Milky Way is a spiral galaxy. -The Milky Way is a massive galaxy. -The Milky Way is a small galaxy. -``` - - \ No newline at end of file diff --git a/docs/source/en/tasks/masked_language_modeling.md b/docs/source/en/tasks/masked_language_modeling.md new file mode 100644 index 000000000000..ba1e9e50dbe8 --- /dev/null +++ b/docs/source/en/tasks/masked_language_modeling.md @@ -0,0 +1,442 @@ + + +# Masked language modeling + +[[open-in-colab]] + + + +Masked language modeling predicts a masked token in a sequence, and the model can attend to tokens bidirectionally. This +means the model has full access to the tokens on the left and right. Masked language modeling is great for tasks that +require a good contextual understanding of an entire sequence. BERT is an example of a masked language model. + +This guide will show you how to: + +1. Finetune [DistilRoBERTa](https://huggingface.co/distilroberta-base) on the [r/askscience](https://www.reddit.com/r/askscience/) subset of the [ELI5](https://huggingface.co/datasets/eli5) dataset. +2. Use your finetuned model for inference. + + +You can finetune other architectures for masked language modeling following the same steps in this guide. 
+Choose one of the following architectures: + + + +[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [CamemBERT](../model_doc/camembert), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ESM](../model_doc/esm), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MRA](../model_doc/mra), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [Perceiver](../model_doc/perceiver), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [TAPAS](../model_doc/tapas), [Wav2Vec2](../model_doc/wav2vec2), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso) + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate +``` + +We encourage you to log in to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to log in: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load ELI5 dataset + +Start by loading a smaller subset of the r/askscience subset of the ELI5 dataset from the 🤗 Datasets library. This'll +give you a chance to experiment and make sure everything works before spending more time training on the full dataset. + +```py +>>> from datasets import load_dataset + +>>> eli5 = load_dataset("eli5", split="train_asks[:5000]") +``` + +Split the dataset's `train_asks` split into a train and test set with the [`~datasets.Dataset.train_test_split`] method: + +```py +>>> eli5 = eli5.train_test_split(test_size=0.2) +``` + +Then take a look at an example: + +```py +>>> eli5["train"][0] +{'answers': {'a_id': ['c3d1aib', 'c3d4lya'], + 'score': [6, 3], + 'text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. I don't know the altitude of that specific mission, but they're usually around 300 km. That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.", + "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"]}, + 'answers_urls': {'url': []}, + 'document': '', + 'q_id': 'nyxfp', + 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. 
Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?', + 'selftext_urls': {'url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg']}, + 'subreddit': 'askscience', + 'title': 'Few questions about this space walk photograph.', + 'title_urls': {'url': []}} +``` + +While this may look like a lot, you're only really interested in the `text` field. What's cool about language modeling tasks is you don't need labels (also known as an unsupervised task) because the next word *is* the label. + +## Preprocess + + + +For masked language modeling, the next step is to load a DistilRoBERTa tokenizer to process the `text` subfield: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilroberta-base") +``` + +You'll notice from the example above, the `text` field is actually nested inside `answers`. This means you'll need to e +xtract the `text` subfield from its nested structure with the [`flatten`](https://huggingface.co/docs/datasets/process.html#flatten) method: + +```py +>>> eli5 = eli5.flatten() +>>> eli5["train"][0] +{'answers.a_id': ['c3d1aib', 'c3d4lya'], + 'answers.score': [6, 3], + 'answers.text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. I don't know the altitude of that specific mission, but they're usually around 300 km. That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.", + "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"], + 'answers_urls.url': [], + 'document': '', + 'q_id': 'nyxfp', + 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?', + 'selftext_urls.url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg'], + 'subreddit': 'askscience', + 'title': 'Few questions about this space walk photograph.', + 'title_urls.url': []} +``` + +Each subfield is now a separate column as indicated by the `answers` prefix, and the `text` field is a list now. Instead +of tokenizing each sentence separately, convert the list to a string so you can jointly tokenize them. + +Here is a first preprocessing function to join the list of strings for each example and tokenize the result: + +```py +>>> def preprocess_function(examples): +... return tokenizer([" ".join(x) for x in examples["answers.text"]]) +``` + +To apply this preprocessing function over the entire dataset, use the 🤗 Datasets [`~datasets.Dataset.map`] method. 
You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once, and increasing the number of processes with `num_proc`. Remove any columns you don't need: + +```py +>>> tokenized_eli5 = eli5.map( +... preprocess_function, +... batched=True, +... num_proc=4, +... remove_columns=eli5["train"].column_names, +... ) +``` + +This dataset contains the token sequences, but some of these are longer than the maximum input length for the model. + +You can now use a second preprocessing function to +- concatenate all the sequences +- split the concatenated sequences into shorter chunks defined by `block_size`, which should be both shorter than the maximum input length and short enough for your GPU RAM. + +```py +>>> block_size = 128 + + +>>> def group_texts(examples): +... # Concatenate all texts. +... concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} +... total_length = len(concatenated_examples[list(examples.keys())[0]]) +... # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can +... # customize this part to your needs. +... if total_length >= block_size: +... total_length = (total_length // block_size) * block_size +... # Split by chunks of block_size. +... result = { +... k: [t[i : i + block_size] for i in range(0, total_length, block_size)] +... for k, t in concatenated_examples.items() +... } +... return result +``` + +Apply the `group_texts` function over the entire dataset: + +```py +>>> lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4) +``` + +Now create a batch of examples using [`DataCollatorForLanguageModeling`]. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. + + + + +Use the end-of-sequence token as the padding token and specify `mlm_probability` to randomly mask tokens each time you iterate over the data: + +```py +>>> from transformers import DataCollatorForLanguageModeling + +>>> tokenizer.pad_token = tokenizer.eos_token +>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15) +``` + + + +Use the end-of-sequence token as the padding token and specify `mlm_probability` to randomly mask tokens each time you iterate over the data: + +```py +>>> from transformers import DataCollatorForLanguageModeling + +>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="tf") +``` + + + +## Train + + + + + +If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! + + + +You're ready to start training your model now! Load DistilRoBERTa with [`AutoModelForMaskedLM`]: + +```py +>>> from transformers import AutoModelForMaskedLM + +>>> model = AutoModelForMaskedLM.from_pretrained("distilroberta-base") +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). +2. Pass the training arguments to [`Trainer`] along with the model, datasets, and data collator. +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> training_args = TrainingArguments( +... 
output_dir="my_awesome_eli5_mlm_model", +... evaluation_strategy="epoch", +... learning_rate=2e-5, +... num_train_epochs=3, +... weight_decay=0.01, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=lm_dataset["train"], +... eval_dataset=lm_dataset["test"], +... data_collator=data_collator, +... ) + +>>> trainer.train() +``` + +Once training is completed, use the [`~transformers.Trainer.evaluate`] method to evaluate your model and get its perplexity: + +```py +>>> import math + +>>> eval_results = trainer.evaluate() +>>> print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}") +Perplexity: 8.76 +``` + +Then share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + +If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! + + +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +```py +>>> from transformers import create_optimizer, AdamWeightDecay + +>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) +``` + +Then you can load DistilRoBERTa with [`TFAutoModelForMaskedLM`]: + +```py +>>> from transformers import TFAutoModelForMaskedLM + +>>> model = TFAutoModelForMaskedLM.from_pretrained("distilroberta-base") +``` + +Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... lm_dataset["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_test_set = model.prepare_tf_dataset( +... lm_dataset["test"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +This can be done by specifying where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> callback = PushToHubCallback( +... output_dir="my_awesome_eli5_mlm_model", +... tokenizer=tokenizer, +... ) +``` + +Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callback to finetune the model: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=[callback]) +``` + +Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! + + + + + +For a more in-depth example of how to finetune a model for masked language modeling, take a look at the corresponding +[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb) +or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb). + + + +## Inference + +Great, now that you've finetuned a model, you can use it for inference! 
+ +Come up with some text you'd like the model to fill in the blank with, and use the special `<mask>` token to indicate the blank: + +```py +>>> text = "The Milky Way is a <mask> galaxy." +``` + +The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for fill-mask with your model, and pass your text to it. If you like, you can use the `top_k` parameter to specify how many predictions to return: + +```py +>>> from transformers import pipeline + +>>> mask_filler = pipeline("fill-mask", "stevhliu/my_awesome_eli5_mlm_model") +>>> mask_filler(text, top_k=3) +[{'score': 0.5150994658470154, + 'token': 21300, + 'token_str': ' spiral', + 'sequence': 'The Milky Way is a spiral galaxy.'}, + {'score': 0.07087188959121704, + 'token': 2232, + 'token_str': ' massive', + 'sequence': 'The Milky Way is a massive galaxy.'}, + {'score': 0.06434620916843414, + 'token': 650, + 'token_str': ' small', + 'sequence': 'The Milky Way is a small galaxy.'}] +``` + + + +Tokenize the text and return the `input_ids` as PyTorch tensors. You'll also need to specify the position of the `<mask>` token: + +```py +>>> import torch +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_eli5_mlm_model") +>>> inputs = tokenizer(text, return_tensors="pt") +>>> mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1] +``` + +Pass your inputs to the model and return the `logits` of the masked token: + +```py +>>> from transformers import AutoModelForMaskedLM + +>>> model = AutoModelForMaskedLM.from_pretrained("stevhliu/my_awesome_eli5_mlm_model") +>>> logits = model(**inputs).logits +>>> mask_token_logits = logits[0, mask_token_index, :] +``` + +Then return the three masked tokens with the highest probability and print them out: + +```py +>>> top_3_tokens = torch.topk(mask_token_logits, 3, dim=1).indices[0].tolist() + +>>> for token in top_3_tokens: +... print(text.replace(tokenizer.mask_token, tokenizer.decode([token]))) +The Milky Way is a spiral galaxy. +The Milky Way is a massive galaxy. +The Milky Way is a small galaxy. +``` + + +Tokenize the text and return the `input_ids` as TensorFlow tensors. You'll also need to specify the position of the `<mask>` token: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_eli5_mlm_model") +>>> inputs = tokenizer(text, return_tensors="tf") +>>> mask_token_index = tf.where(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1] +``` + +Pass your inputs to the model and return the `logits` of the masked token: + +```py +>>> from transformers import TFAutoModelForMaskedLM + +>>> model = TFAutoModelForMaskedLM.from_pretrained("stevhliu/my_awesome_eli5_mlm_model") +>>> logits = model(**inputs).logits +>>> mask_token_logits = logits[0, mask_token_index, :] +``` + +Then return the three masked tokens with the highest probability and print them out: + +```py +>>> top_3_tokens = tf.math.top_k(mask_token_logits, 3).indices.numpy() + +>>> for token in top_3_tokens: +... print(text.replace(tokenizer.mask_token, tokenizer.decode([token]))) +The Milky Way is a spiral galaxy. +The Milky Way is a massive galaxy. +The Milky Way is a small galaxy.
+``` + + diff --git a/docs/source/en/tasks/monocular_depth_estimation.md b/docs/source/en/tasks/monocular_depth_estimation.md new file mode 100644 index 000000000000..fa59771cbb02 --- /dev/null +++ b/docs/source/en/tasks/monocular_depth_estimation.md @@ -0,0 +1,151 @@ + + +# Monocular depth estimation + +Monocular depth estimation is a computer vision task that involves predicting the depth information of a scene from a +single image. In other words, it is the process of estimating the distance of objects in a scene from +a single camera viewpoint. + +Monocular depth estimation has various applications, including 3D reconstruction, augmented reality, autonomous driving, +and robotics. It is a challenging task as it requires the model to understand the complex relationships between objects +in the scene and the corresponding depth information, which can be affected by factors such as lighting conditions, +occlusion, and texture. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[DPT](../model_doc/dpt), [GLPN](../model_doc/glpn) + + + + + +In this guide you'll learn how to: + +* create a depth estimation pipeline +* run depth estimation inference by hand + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install -q transformers +``` + +## Depth estimation pipeline + +The simplest way to try out inference with a model supporting depth estimation is to use the corresponding [`pipeline`]. +Instantiate a pipeline from a [checkpoint on the Hugging Face Hub](https://huggingface.co/models?pipeline_tag=depth-estimation&sort=downloads): + +```py +>>> from transformers import pipeline + +>>> checkpoint = "vinvino02/glpn-nyu" +>>> depth_estimator = pipeline("depth-estimation", model=checkpoint) +``` + +Next, choose an image to analyze: + +```py +>>> from PIL import Image +>>> import requests + +>>> url = "https://unsplash.com/photos/HwBAsSbPBDU/download?ixid=MnwxMjA3fDB8MXxzZWFyY2h8MzR8fGNhciUyMGluJTIwdGhlJTIwc3RyZWV0fGVufDB8MHx8fDE2Nzg5MDEwODg&force=true&w=640" +>>> image = Image.open(requests.get(url, stream=True).raw) +>>> image +``` + +
+ Photo of a busy street +
+ +Pass the image to the pipeline. + +```py +>>> predictions = depth_estimator(image) +``` + +The pipeline returns a dictionary with two entries. The first one, called `predicted_depth`, is a tensor with the values +being the depth expressed in meters for each pixel. +The second one, `depth`, is a PIL image that visualizes the depth estimation result. + +Let's take a look at the visualized result: + +```py +>>> predictions["depth"] +``` + +
+ Depth estimation visualization +
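+
+If you'd rather work with the raw values than the visualization, you can inspect the `predicted_depth` tensor from the same output dictionary. The snippet below is a minimal sketch; the exact tensor shape and value range depend on the checkpoint:
+
+```py
+>>> # peek at the raw per-pixel depth values (in meters) returned by the pipeline
+>>> predicted_depth = predictions["predicted_depth"]
+>>> print(predicted_depth.shape)  # exact shape depends on the checkpoint's processing resolution
+>>> print(predicted_depth.min().item(), predicted_depth.max().item())  # rough depth range in meters
+```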
+ +## Depth estimation inference by hand + +Now that you've seen how to use the depth estimation pipeline, let's see how we can replicate the same result by hand. + +Start by loading the model and associated processor from a [checkpoint on the Hugging Face Hub](https://huggingface.co/models?pipeline_tag=depth-estimation&sort=downloads). +Here we'll use the same checkpoint as before: + +```py +>>> from transformers import AutoImageProcessor, AutoModelForDepthEstimation + +>>> checkpoint = "vinvino02/glpn-nyu" + +>>> image_processor = AutoImageProcessor.from_pretrained(checkpoint) +>>> model = AutoModelForDepthEstimation.from_pretrained(checkpoint) +``` + +Prepare the image input for the model using the `image_processor` that will take care of the necessary image transformations +such as resizing and normalization: + +```py +>>> pixel_values = image_processor(image, return_tensors="pt").pixel_values +``` + +Pass the prepared inputs through the model: + +```py +>>> import torch + +>>> with torch.no_grad(): +... outputs = model(pixel_values) +... predicted_depth = outputs.predicted_depth +``` + +Visualize the results: + +```py +>>> import numpy as np + +>>> # interpolate to original size +>>> prediction = torch.nn.functional.interpolate( +... predicted_depth.unsqueeze(1), +... size=image.size[::-1], +... mode="bicubic", +... align_corners=False, +... ).squeeze() +>>> output = prediction.numpy() + +>>> formatted = (output * 255 / np.max(output)).astype("uint8") +>>> depth = Image.fromarray(formatted) +>>> depth +``` + +
+ Depth estimation visualization +
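+
+If you want to keep the results around, you can also write them to disk. A small sketch, assuming PNG for the visualization and NumPy's `.npy` format for the raw values (the file names here are arbitrary):
+
+```py
+>>> # save the visualization and the raw depth map produced above
+>>> depth.save("depth_visualization.png")
+>>> np.save("predicted_depth.npy", output)
+```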
diff --git a/docs/source/en/tasks/multiple_choice.md b/docs/source/en/tasks/multiple_choice.md new file mode 100644 index 000000000000..938d3ba461bb --- /dev/null +++ b/docs/source/en/tasks/multiple_choice.md @@ -0,0 +1,465 @@ + + +# Multiple choice + +[[open-in-colab]] + +A multiple choice task is similar to question answering, except several candidate answers are provided along with a context and the model is trained to select the correct answer. + +This guide will show you how to: + +1. Finetune [BERT](https://huggingface.co/bert-base-uncased) on the `regular` configuration of the [SWAG](https://huggingface.co/datasets/swag) dataset to select the best answer given multiple options and some context. +2. Use your finetuned model for inference. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[ALBERT](../model_doc/albert), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [I-BERT](../model_doc/ibert), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MRA](../model_doc/mra), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [QDQBert](../model_doc/qdqbert), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso) + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate +``` + +We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load SWAG dataset + +Start by loading the `regular` configuration of the SWAG dataset from the 🤗 Datasets library: + +```py +>>> from datasets import load_dataset + +>>> swag = load_dataset("swag", "regular") +``` + +Then take a look at an example: + +```py +>>> swag["train"][0] +{'ending0': 'passes by walking down the street playing their instruments.', + 'ending1': 'has heard approaching them.', + 'ending2': "arrives and they're outside dancing and asleep.", + 'ending3': 'turns the lead singer watches the performance.', + 'fold-ind': '3416', + 'gold-source': 'gold', + 'label': 0, + 'sent1': 'Members of the procession walk down the street holding small horn brass instruments.', + 'sent2': 'A drum line', + 'startphrase': 'Members of the procession walk down the street holding small horn brass instruments. 
A drum line', + 'video-id': 'anetv_jkn6uvmqwh4'} +``` + +While it looks like there are a lot of fields here, it is actually pretty straightforward: + +- `sent1` and `sent2`: these fields show how a sentence starts, and if you put the two together, you get the `startphrase` field. +- `ending`: suggests a possible ending for how a sentence can end, but only one of them is correct. +- `label`: identifies the correct sentence ending. + +## Preprocess + +The next step is to load a BERT tokenizer to process the sentence starts and the four possible endings: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") +``` + +The preprocessing function you want to create needs to: + +1. Make four copies of the `sent1` field and combine each of them with `sent2` to recreate how a sentence starts. +2. Combine `sent2` with each of the four possible sentence endings. +3. Flatten these two lists so you can tokenize them, and then unflatten them afterward so each example has a corresponding `input_ids`, `attention_mask`, and `labels` field. + +```py +>>> ending_names = ["ending0", "ending1", "ending2", "ending3"] + + +>>> def preprocess_function(examples): +... first_sentences = [[context] * 4 for context in examples["sent1"]] +... question_headers = examples["sent2"] +... second_sentences = [ +... [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers) +... ] + +... first_sentences = sum(first_sentences, []) +... second_sentences = sum(second_sentences, []) + +... tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True) +... return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()} +``` + +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once: + +```py +tokenized_swag = swag.map(preprocess_function, batched=True) +``` + +🤗 Transformers doesn't have a data collator for multiple choice, so you'll need to adapt the [`DataCollatorWithPadding`] to create a batch of examples. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. + +`DataCollatorForMultipleChoice` flattens all the model inputs, applies padding, and then unflattens the results: + + + +```py +>>> from dataclasses import dataclass +>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy +>>> from typing import Optional, Union +>>> import torch + + +>>> @dataclass +... class DataCollatorForMultipleChoice: +... """ +... Data collator that will dynamically pad the inputs for multiple choice received. +... """ + +... tokenizer: PreTrainedTokenizerBase +... padding: Union[bool, str, PaddingStrategy] = True +... max_length: Optional[int] = None +... pad_to_multiple_of: Optional[int] = None + +... def __call__(self, features): +... label_name = "label" if "label" in features[0].keys() else "labels" +... labels = [feature.pop(label_name) for feature in features] +... batch_size = len(features) +... num_choices = len(features[0]["input_ids"]) +... flattened_features = [ +... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features +... ] +... flattened_features = sum(flattened_features, []) + +... batch = self.tokenizer.pad( +... 
flattened_features, +... padding=self.padding, +... max_length=self.max_length, +... pad_to_multiple_of=self.pad_to_multiple_of, +... return_tensors="pt", +... ) + +... batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()} +... batch["labels"] = torch.tensor(labels, dtype=torch.int64) +... return batch +``` + + +```py +>>> from dataclasses import dataclass +>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy +>>> from typing import Optional, Union +>>> import tensorflow as tf + + +>>> @dataclass +... class DataCollatorForMultipleChoice: +... """ +... Data collator that will dynamically pad the inputs for multiple choice received. +... """ + +... tokenizer: PreTrainedTokenizerBase +... padding: Union[bool, str, PaddingStrategy] = True +... max_length: Optional[int] = None +... pad_to_multiple_of: Optional[int] = None + +... def __call__(self, features): +... label_name = "label" if "label" in features[0].keys() else "labels" +... labels = [feature.pop(label_name) for feature in features] +... batch_size = len(features) +... num_choices = len(features[0]["input_ids"]) +... flattened_features = [ +... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features +... ] +... flattened_features = sum(flattened_features, []) + +... batch = self.tokenizer.pad( +... flattened_features, +... padding=self.padding, +... max_length=self.max_length, +... pad_to_multiple_of=self.pad_to_multiple_of, +... return_tensors="tf", +... ) + +... batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()} +... batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64) +... return batch +``` + + + +## Evaluate + +Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): + +```py +>>> import evaluate + +>>> accuracy = evaluate.load("accuracy") +``` + +Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the accuracy: + +```py +>>> import numpy as np + + +>>> def compute_metrics(eval_pred): +... predictions, labels = eval_pred +... predictions = np.argmax(predictions, axis=1) +... return accuracy.compute(predictions=predictions, references=labels) +``` + +Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. + +## Train + + + + + +If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! + + + +You're ready to start training your model now! Load BERT with [`AutoModelForMultipleChoice`]: + +```py +>>> from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer + +>>> model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased") +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). 
At the end of each epoch, the [`Trainer`] will evaluate the accuracy and save the training checkpoint. +2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_swag_model", +... evaluation_strategy="epoch", +... save_strategy="epoch", +... load_best_model_at_end=True, +... learning_rate=5e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... num_train_epochs=3, +... weight_decay=0.01, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_swag["train"], +... eval_dataset=tokenized_swag["validation"], +... tokenizer=tokenizer, +... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer), +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + +If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! + + +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +```py +>>> from transformers import create_optimizer + +>>> batch_size = 16 +>>> num_train_epochs = 2 +>>> total_train_steps = (len(tokenized_swag["train"]) // batch_size) * num_train_epochs +>>> optimizer, schedule = create_optimizer(init_lr=5e-5, num_warmup_steps=0, num_train_steps=total_train_steps) +``` + +Then you can load BERT with [`TFAutoModelForMultipleChoice`]: + +```py +>>> from transformers import TFAutoModelForMultipleChoice + +>>> model = TFAutoModelForMultipleChoice.from_pretrained("bert-base-uncased") +``` + +Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer) +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_swag["train"], +... shuffle=True, +... batch_size=batch_size, +... collate_fn=data_collator, +... ) + +>>> tf_validation_set = model.prepare_tf_dataset( +... tokenized_swag["validation"], +... shuffle=False, +... batch_size=batch_size, +... collate_fn=data_collator, +... ) +``` + +Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +```py +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +The last two things to setup before you start training is to compute the accuracy from the predictions, and provide a way to push your model to the Hub. Both are done by using [Keras callbacks](../main_classes/keras_callbacks). 
+ +Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) +``` + +Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="my_awesome_model", +... tokenizer=tokenizer, +... ) +``` + +Then bundle your callbacks together: + +```py +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callbacks to finetune the model: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2, callbacks=callbacks) +``` + +Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! + + + + + + +For a more in-depth example of how to finetune a model for multiple choice, take a look at the corresponding +[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb) +or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb). + + + +## Inference + +Great, now that you've finetuned a model, you can use it for inference! + +Come up with some text and two candidate answers: + +```py +>>> prompt = "France has a bread law, Le Décret Pain, with strict rules on what is allowed in a traditional baguette." +>>> candidate1 = "The law does not apply to croissants and brioche." +>>> candidate2 = "The law applies to baguettes." +``` + + + +Tokenize each prompt and candidate answer pair and return PyTorch tensors. 
You should also create some `labels`:
+
+```py
+>>> import torch
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_swag_model")
+>>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="pt", padding=True)
+>>> labels = torch.tensor(0).unsqueeze(0)
+```
+
+Pass your inputs and labels to the model and return the `logits`:
+
+```py
+>>> from transformers import AutoModelForMultipleChoice
+
+>>> model = AutoModelForMultipleChoice.from_pretrained("my_awesome_swag_model")
+>>> outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels)
+>>> logits = outputs.logits
+```
+
+Get the class with the highest probability:
+
+```py
+>>> predicted_class = logits.argmax().item()
+>>> predicted_class
+0
+```
+
+
+Tokenize each prompt and candidate answer pair and return TensorFlow tensors:
+
+```py
+>>> import tensorflow as tf
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_swag_model")
+>>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="tf", padding=True)
+```
+
+Pass your inputs to the model and return the `logits`:
+
+```py
+>>> from transformers import TFAutoModelForMultipleChoice
+
+>>> model = TFAutoModelForMultipleChoice.from_pretrained("my_awesome_swag_model")
+>>> inputs = {k: tf.expand_dims(v, 0) for k, v in inputs.items()}
+>>> outputs = model(inputs)
+>>> logits = outputs.logits
+```
+
+Get the class with the highest probability:
+
+```py
+>>> predicted_class = int(tf.math.argmax(logits, axis=-1)[0])
+>>> predicted_class
+0
+```
+
+
diff --git a/docs/source/en/tasks/multiple_choice.mdx b/docs/source/en/tasks/multiple_choice.mdx
deleted file mode 100644
index 1a1a517df7da..000000000000
--- a/docs/source/en/tasks/multiple_choice.mdx
+++ /dev/null
@@ -1,447 +0,0 @@
-
-
-# Multiple choice
-
-A multiple choice task is similar to question answering, except several candidate answers are provided along with a context and the model is trained to select the correct answer.
-
-This guide will show you how to:
-
-1. Finetune [BERT](https://huggingface.co/bert-base-uncased) on the `regular` configuration of the [SWAG](https://huggingface.co/datasets/swag) dataset to select the best answer given multiple options and some context.
-2. Use your finetuned model for inference.
-
-Before you begin, make sure you have all the necessary libraries installed:
-
-```bash
-pip install transformers datasets evaluate
-```
-
-We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login:
-
-```py
->>> from huggingface_hub import notebook_login
-
->>> notebook_login()
-```
-
-## Load SWAG dataset
-
-Start by loading the `regular` configuration of the SWAG dataset from the 🤗 Datasets library:
-
-```py
->>> from datasets import load_dataset
-
->>> swag = load_dataset("swag", "regular")
-```
-
-Then take a look at an example:
-
-```py
->>> swag["train"][0]
-{'ending0': 'passes by walking down the street playing their instruments.',
- 'ending1': 'has heard approaching them.',
- 'ending2': "arrives and they're outside dancing and asleep.",
- 'ending3': 'turns the lead singer watches the performance.',
- 'fold-ind': '3416',
- 'gold-source': 'gold',
- 'label': 0,
- 'sent1': 'Members of the procession walk down the street holding small horn brass instruments.',
- 'sent2': 'A drum line',
- 'startphrase': 'Members of the procession walk down the street holding small horn brass instruments. 
A drum line', - 'video-id': 'anetv_jkn6uvmqwh4'} -``` - -While it looks like there are a lot of fields here, it is actually pretty straightforward: - -- `sent1` and `sent2`: these fields show how a sentence starts, and if you put the two together, you get the `startphrase` field. -- `ending`: suggests a possible ending for how a sentence can end, but only one of them is correct. -- `label`: identifies the correct sentence ending. - -## Preprocess - -The next step is to load a BERT tokenizer to process the sentence starts and the four possible endings: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") -``` - -The preprocessing function you want to create needs to: - -1. Make four copies of the `sent1` field and combine each of them with `sent2` to recreate how a sentence starts. -2. Combine `sent2` with each of the four possible sentence endings. -3. Flatten these two lists so you can tokenize them, and then unflatten them afterward so each example has a corresponding `input_ids`, `attention_mask`, and `labels` field. - -```py ->>> ending_names = ["ending0", "ending1", "ending2", "ending3"] - - ->>> def preprocess_function(examples): -... first_sentences = [[context] * 4 for context in examples["sent1"]] -... question_headers = examples["sent2"] -... second_sentences = [ -... [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers) -... ] - -... first_sentences = sum(first_sentences, []) -... second_sentences = sum(second_sentences, []) - -... tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True) -... return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()} -``` - -To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once: - -```py -tokenized_swag = swag.map(preprocess_function, batched=True) -``` - -🤗 Transformers doesn't have a data collator for multiple choice, so you'll need to adapt the [`DataCollatorWithPadding`] to create a batch of examples. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximium length. - -`DataCollatorForMultipleChoice` flattens all the model inputs, applies padding, and then unflattens the results: - - - -```py ->>> from dataclasses import dataclass ->>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy ->>> from typing import Optional, Union ->>> import torch - - ->>> @dataclass -... class DataCollatorForMultipleChoice: -... """ -... Data collator that will dynamically pad the inputs for multiple choice received. -... """ - -... tokenizer: PreTrainedTokenizerBase -... padding: Union[bool, str, PaddingStrategy] = True -... max_length: Optional[int] = None -... pad_to_multiple_of: Optional[int] = None - -... def __call__(self, features): -... label_name = "label" if "label" in features[0].keys() else "labels" -... labels = [feature.pop(label_name) for feature in features] -... batch_size = len(features) -... num_choices = len(features[0]["input_ids"]) -... flattened_features = [ -... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features -... ] -... flattened_features = sum(flattened_features, []) - -... batch = self.tokenizer.pad( -... 
flattened_features, -... padding=self.padding, -... max_length=self.max_length, -... pad_to_multiple_of=self.pad_to_multiple_of, -... return_tensors="pt", -... ) - -... batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()} -... batch["labels"] = torch.tensor(labels, dtype=torch.int64) -... return batch -``` - - -```py ->>> from dataclasses import dataclass ->>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy ->>> from typing import Optional, Union ->>> import tensorflow as tf - - ->>> @dataclass -... class DataCollatorForMultipleChoice: -... """ -... Data collator that will dynamically pad the inputs for multiple choice received. -... """ - -... tokenizer: PreTrainedTokenizerBase -... padding: Union[bool, str, PaddingStrategy] = True -... max_length: Optional[int] = None -... pad_to_multiple_of: Optional[int] = None - -... def __call__(self, features): -... label_name = "label" if "label" in features[0].keys() else "labels" -... labels = [feature.pop(label_name) for feature in features] -... batch_size = len(features) -... num_choices = len(features[0]["input_ids"]) -... flattened_features = [ -... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features -... ] -... flattened_features = sum(flattened_features, []) - -... batch = self.tokenizer.pad( -... flattened_features, -... padding=self.padding, -... max_length=self.max_length, -... pad_to_multiple_of=self.pad_to_multiple_of, -... return_tensors="tf", -... ) - -... batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()} -... batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64) -... return batch -``` - - - -## Evaluate - -Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): - -```py ->>> import evaluate - ->>> accuracy = evaluate.load("accuracy") -``` - -Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the accuracy: - -```py ->>> import numpy as np - - ->>> def compute_metrics(eval_pred): -... predictions, labels = eval_pred -... predictions = np.argmax(predictions, axis=1) -... return accuracy.compute(predictions=predictions, references=labels) -``` - -Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. - -## Train - - - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! - - -You're ready to start training your model now! Load BERT with [`AutoModelForMultipleChoice`]: - -```py ->>> from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer - ->>> model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased") -``` - -At this point, only three steps remain: - -1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). 
At the end of each epoch, the [`Trainer`] will evaluate the accuracy and save the training checkpoint. -2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. -3. Call [`~Trainer.train`] to finetune your model. - -```py ->>> training_args = TrainingArguments( -... output_dir="my_awesome_swag_model", -... evaluation_strategy="epoch", -... save_strategy="epoch", -... load_best_model_at_end=True, -... learning_rate=5e-5, -... per_device_train_batch_size=16, -... per_device_eval_batch_size=16, -... num_train_epochs=3, -... weight_decay=0.01, -... push_to_hub=True, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=tokenized_swag["train"], -... eval_dataset=tokenized_swag["validation"], -... tokenizer=tokenizer, -... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer), -... compute_metrics=compute_metrics, -... ) - ->>> trainer.train() -``` - -Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - - - - -If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! - - -To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: - -```py ->>> from transformers import create_optimizer - ->>> batch_size = 16 ->>> num_train_epochs = 2 ->>> total_train_steps = (len(tokenized_swag["train"]) // batch_size) * num_train_epochs ->>> optimizer, schedule = create_optimizer(init_lr=5e-5, num_warmup_steps=0, num_train_steps=total_train_steps) -``` - -Then you can load BERT with [`TFAutoModelForMultipleChoice`]: - -```py ->>> from transformers import TFAutoModelForMultipleChoice - ->>> model = TFAutoModelForMultipleChoice.from_pretrained("bert-base-uncased") -``` - -Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: - -```py ->>> data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer) ->>> tf_train_set = model.prepare_tf_dataset( -... tokenized_swag["train"], -... shuffle=True, -... batch_size=batch_size, -... collate_fn=data_collator, -... ) - ->>> tf_validation_set = model.prepare_tf_dataset( -... tokenized_swag["validation"], -... shuffle=False, -... batch_size=batch_size, -... collate_fn=data_collator, -... ) -``` - -Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> model.compile(optimizer=optimizer) -``` - -The last two things to setup before you start training is to compute the accuracy from the predictions, and provide a way to push your model to the Hub. Both are done by using [Keras callbacks](./main_classes/keras_callbacks). - -Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: - -```py ->>> from transformers.keras_callbacks import KerasMetricCallback - ->>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) -``` - -Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: - -```py ->>> from transformers.keras_callbacks import PushToHubCallback - ->>> push_to_hub_callback = PushToHubCallback( -... output_dir="my_awesome_model", -... tokenizer=tokenizer, -... 
) -``` - -Then bundle your callbacks together: - -```py ->>> callbacks = [metric_callback, push_to_hub_callback] -``` - -Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callbacks to finetune the model: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2, callbacks=callbacks) -``` - -Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! - - - - - - -For a more in-depth example of how to finetune a model for multiple choice, take a look at the corresponding -[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb) -or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb). - - - -## Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Come up with some text and two candidate answers: - -```py ->>> prompt = "France has a bread law, Le Décret Pain, with strict rules on what is allowed in a traditional baguette." ->>> candidate1 = "The law does not apply to croissants and brioche." ->>> candidate2 = "The law applies to baguettes." -``` - - - -Tokenize each prompt and candidate answer pair and return PyTorch tensors. You should also create some `labels`: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_swag_model") ->>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="pt", padding=True) ->>> labels = torch.tensor(0).unsqueeze(0) -``` - -Pass your inputs and labels to the model and return the `logits`: - -```py ->>> from transformers import AutoModelForMultipleChoice - ->>> model = AutoModelForMultipleChoice.from_pretrained("my_awesome_swag_model") ->>> outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels) ->>> logits = outputs.logits -``` - -Get the class with the highest probability: - -```py ->>> predicted_class = logits.argmax().item() ->>> predicted_class -'0' -``` - - -Tokenize each prompt and candidate answer pair and return TensorFlow tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_swag_model") ->>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="tf", padding=True) -``` - -Pass your inputs to the model and return the `logits`: - -```py ->>> from transformers import TFAutoModelForMultipleChoice - ->>> model = TFAutoModelForMultipleChoice.from_pretrained("my_awesome_swag_model") ->>> inputs = {k: tf.expand_dims(v, 0) for k, v in inputs.items()} ->>> outputs = model(inputs) ->>> logits = outputs.logits -``` - -Get the class with the highest probability: - -```py ->>> predicted_class = int(tf.math.argmax(logits, axis=-1)[0]) ->>> predicted_class -'0' -``` - - \ No newline at end of file diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md new file mode 100644 index 000000000000..7511ee66dd0b --- /dev/null +++ b/docs/source/en/tasks/object_detection.md @@ -0,0 +1,593 @@ + + +# Object detection + +[[open-in-colab]] + +Object detection is the computer vision task of detecting instances (such as humans, buildings, or cars) in an image. 
Object detection models receive an image as input and output +coordinates of the bounding boxes and associated labels of the detected objects. An image can contain multiple objects, +each with its own bounding box and a label (e.g. it can have a car and a building), and each object can +be present in different parts of an image (e.g. the image can have several cars). +This task is commonly used in autonomous driving for detecting things like pedestrians, road signs, and traffic lights. +Other applications include counting objects in images, image search, and more. + +In this guide, you will learn how to: + + 1. Finetune [DETR](https://huggingface.co/docs/transformers/model_doc/detr), a model that combines a convolutional + backbone with an encoder-decoder Transformer, on the [CPPE-5](https://huggingface.co/datasets/cppe-5) + dataset. + 2. Use your finetuned model for inference. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos) + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install -q datasets transformers evaluate timm albumentations +``` + +You'll use 🤗 Datasets to load a dataset from the Hugging Face Hub, 🤗 Transformers to train your model, +and `albumentations` to augment the data. `timm` is currently required to load a convolutional backbone for the DETR model. + +We encourage you to share your model with the community. Log in to your Hugging Face account to upload it to the Hub. +When prompted, enter your token to log in: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load the CPPE-5 dataset + +The [CPPE-5 dataset](https://huggingface.co/datasets/cppe-5) contains images with +annotations identifying medical personal protective equipment (PPE) in the context of the COVID-19 pandemic. + +Start by loading the dataset: + +```py +>>> from datasets import load_dataset + +>>> cppe5 = load_dataset("cppe-5") +>>> cppe5 +DatasetDict({ + train: Dataset({ + features: ['image_id', 'image', 'width', 'height', 'objects'], + num_rows: 1000 + }) + test: Dataset({ + features: ['image_id', 'image', 'width', 'height', 'objects'], + num_rows: 29 + }) +}) +``` + +You'll see that this dataset already comes with a training set containing 1000 images and a test set with 29 images. + +To get familiar with the data, explore what the examples look like. 
+ +```py +>>> cppe5["train"][0] +{'image_id': 15, + 'image': , + 'width': 943, + 'height': 663, + 'objects': {'id': [114, 115, 116, 117], + 'area': [3796, 1596, 152768, 81002], + 'bbox': [[302.0, 109.0, 73.0, 52.0], + [810.0, 100.0, 57.0, 28.0], + [160.0, 31.0, 248.0, 616.0], + [741.0, 68.0, 202.0, 401.0]], + 'category': [4, 4, 0, 0]}} +``` + +The examples in the dataset have the following fields: +- `image_id`: the example image id +- `image`: a `PIL.Image.Image` object containing the image +- `width`: width of the image +- `height`: height of the image +- `objects`: a dictionary containing bounding box metadata for the objects in the image: + - `id`: the annotation id + - `area`: the area of the bounding box + - `bbox`: the object's bounding box (in the [COCO format](https://albumentations.ai/docs/getting_started/bounding_boxes_augmentation/#coco) ) + - `category`: the object's category, with possible values including `Coverall (0)`, `Face_Shield (1)`, `Gloves (2)`, `Goggles (3)` and `Mask (4)` + +You may notice that the `bbox` field follows the COCO format, which is the format that the DETR model expects. +However, the grouping of the fields inside `objects` differs from the annotation format DETR requires. You will +need to apply some preprocessing transformations before using this data for training. + +To get an even better understanding of the data, visualize an example in the dataset. + +```py +>>> import numpy as np +>>> import os +>>> from PIL import Image, ImageDraw + +>>> image = cppe5["train"][0]["image"] +>>> annotations = cppe5["train"][0]["objects"] +>>> draw = ImageDraw.Draw(image) + +>>> categories = cppe5["train"].features["objects"].feature["category"].names + +>>> id2label = {index: x for index, x in enumerate(categories, start=0)} +>>> label2id = {v: k for k, v in id2label.items()} + +>>> for i in range(len(annotations["id"])): +... box = annotations["bbox"][i] +... class_idx = annotations["category"][i] +... x, y, w, h = tuple(box) +... draw.rectangle((x, y, x + w, y + h), outline="red", width=1) +... draw.text((x, y), id2label[class_idx], fill="white") + +>>> image +``` + +
+ CPPE-5 Image Example +
+ +To visualize the bounding boxes with associated labels, you can get the labels from the dataset's metadata, specifically +the `category` field. +You'll also want to create dictionaries that map a label id to a label class (`id2label`) and the other way around (`label2id`). +You can use them later when setting up the model. Including these maps will make your model reusable by others if you share +it on the Hugging Face Hub. + +As a final step of getting familiar with the data, explore it for potential issues. One common problem with datasets for +object detection is bounding boxes that "stretch" beyond the edge of the image. Such "runaway" bounding boxes can raise +errors during training and should be addressed at this stage. There are a few examples with this issue in this dataset. +To keep things simple in this guide, we remove these images from the data. + +```py +>>> remove_idx = [590, 821, 822, 875, 876, 878, 879] +>>> keep = [i for i in range(len(cppe5["train"])) if i not in remove_idx] +>>> cppe5["train"] = cppe5["train"].select(keep) +``` + +## Preprocess the data + +To finetune a model, you must preprocess the data you plan to use to match precisely the approach used for the pre-trained model. +[`AutoImageProcessor`] takes care of processing image data to create `pixel_values`, `pixel_mask`, and +`labels` that a DETR model can train with. The image processor has some attributes that you won't have to worry about: + +- `image_mean = [0.485, 0.456, 0.406 ]` +- `image_std = [0.229, 0.224, 0.225]` + +These are the mean and standard deviation used to normalize images during the model pre-training. These values are crucial +to replicate when doing inference or finetuning a pre-trained image model. + +Instantiate the image processor from the same checkpoint as the model you want to finetune. + +```py +>>> from transformers import AutoImageProcessor + +>>> checkpoint = "facebook/detr-resnet-50" +>>> image_processor = AutoImageProcessor.from_pretrained(checkpoint) +``` + +Before passing the images to the `image_processor`, apply two preprocessing transformations to the dataset: +- Augmenting images +- Reformatting annotations to meet DETR expectations + +First, to make sure the model does not overfit on the training data, you can apply image augmentation with any data augmentation library. Here we use [Albumentations](https://albumentations.ai/docs/) ... +This library ensures that transformations affect the image and update the bounding boxes accordingly. +The 🤗 Datasets library documentation has a detailed [guide on how to augment images for object detection](https://huggingface.co/docs/datasets/object_detection), +and it uses the exact same dataset as an example. Apply the same approach here, resize each image to (480, 480), +flip it horizontally, and brighten it: + +```py +>>> import albumentations +>>> import numpy as np +>>> import torch + +>>> transform = albumentations.Compose( +... [ +... albumentations.Resize(480, 480), +... albumentations.HorizontalFlip(p=1.0), +... albumentations.RandomBrightnessContrast(p=1.0), +... ], +... bbox_params=albumentations.BboxParams(format="coco", label_fields=["category"]), +... ) +``` + +The `image_processor` expects the annotations to be in the following format: `{'image_id': int, 'annotations': List[Dict]}`, + where each dictionary is a COCO object annotation. Let's add a function to reformat annotations for a single example: + +```py +>>> def formatted_anns(image_id, category, area, bbox): +... annotations = [] +... 
for i in range(0, len(category)): +... new_ann = { +... "image_id": image_id, +... "category_id": category[i], +... "isCrowd": 0, +... "area": area[i], +... "bbox": list(bbox[i]), +... } +... annotations.append(new_ann) + +... return annotations +``` + +Now you can combine the image and annotation transformations to use on a batch of examples: + +```py +>>> # transforming a batch +>>> def transform_aug_ann(examples): +... image_ids = examples["image_id"] +... images, bboxes, area, categories = [], [], [], [] +... for image, objects in zip(examples["image"], examples["objects"]): +... image = np.array(image.convert("RGB"))[:, :, ::-1] +... out = transform(image=image, bboxes=objects["bbox"], category=objects["category"]) + +... area.append(objects["area"]) +... images.append(out["image"]) +... bboxes.append(out["bboxes"]) +... categories.append(out["category"]) + +... targets = [ +... {"image_id": id_, "annotations": formatted_anns(id_, cat_, ar_, box_)} +... for id_, cat_, ar_, box_ in zip(image_ids, categories, area, bboxes) +... ] + +... return image_processor(images=images, annotations=targets, return_tensors="pt") +``` + +Apply this preprocessing function to the entire dataset using 🤗 Datasets [`~datasets.Dataset.with_transform`] method. This method applies +transformations on the fly when you load an element of the dataset. + +At this point, you can check what an example from the dataset looks like after the transformations. You should see a tensor +with `pixel_values`, a tensor with `pixel_mask`, and `labels`. + +```py +>>> cppe5["train"] = cppe5["train"].with_transform(transform_aug_ann) +>>> cppe5["train"][15] +{'pixel_values': tensor([[[ 0.9132, 0.9132, 0.9132, ..., -1.9809, -1.9809, -1.9809], + [ 0.9132, 0.9132, 0.9132, ..., -1.9809, -1.9809, -1.9809], + [ 0.9132, 0.9132, 0.9132, ..., -1.9638, -1.9638, -1.9638], + ..., + [-1.5699, -1.5699, -1.5699, ..., -1.9980, -1.9980, -1.9980], + [-1.5528, -1.5528, -1.5528, ..., -1.9980, -1.9809, -1.9809], + [-1.5528, -1.5528, -1.5528, ..., -1.9980, -1.9809, -1.9809]], + + [[ 1.3081, 1.3081, 1.3081, ..., -1.8431, -1.8431, -1.8431], + [ 1.3081, 1.3081, 1.3081, ..., -1.8431, -1.8431, -1.8431], + [ 1.3081, 1.3081, 1.3081, ..., -1.8256, -1.8256, -1.8256], + ..., + [-1.3179, -1.3179, -1.3179, ..., -1.8606, -1.8606, -1.8606], + [-1.3004, -1.3004, -1.3004, ..., -1.8606, -1.8431, -1.8431], + [-1.3004, -1.3004, -1.3004, ..., -1.8606, -1.8431, -1.8431]], + + [[ 1.4200, 1.4200, 1.4200, ..., -1.6476, -1.6476, -1.6476], + [ 1.4200, 1.4200, 1.4200, ..., -1.6476, -1.6476, -1.6476], + [ 1.4200, 1.4200, 1.4200, ..., -1.6302, -1.6302, -1.6302], + ..., + [-1.0201, -1.0201, -1.0201, ..., -1.5604, -1.5604, -1.5604], + [-1.0027, -1.0027, -1.0027, ..., -1.5604, -1.5430, -1.5430], + [-1.0027, -1.0027, -1.0027, ..., -1.5604, -1.5430, -1.5430]]]), + 'pixel_mask': tensor([[1, 1, 1, ..., 1, 1, 1], + [1, 1, 1, ..., 1, 1, 1], + [1, 1, 1, ..., 1, 1, 1], + ..., + [1, 1, 1, ..., 1, 1, 1], + [1, 1, 1, ..., 1, 1, 1], + [1, 1, 1, ..., 1, 1, 1]]), + 'labels': {'size': tensor([800, 800]), 'image_id': tensor([756]), 'class_labels': tensor([4]), 'boxes': tensor([[0.7340, 0.6986, 0.3414, 0.5944]]), 'area': tensor([519544.4375]), 'iscrowd': tensor([0]), 'orig_size': tensor([480, 480])}} +``` + +You have successfully augmented the individual images and prepared their annotations. However, preprocessing isn't +complete yet. In the final step, create a custom `collate_fn` to batch images together. 
+Pad images (which are now `pixel_values`) to the largest image in a batch, and create a corresponding `pixel_mask` +to indicate which pixels are real (1) and which are padding (0). + +```py +>>> def collate_fn(batch): +... pixel_values = [item["pixel_values"] for item in batch] +... encoding = image_processor.pad(pixel_values, return_tensors="pt") +... labels = [item["labels"] for item in batch] +... batch = {} +... batch["pixel_values"] = encoding["pixel_values"] +... batch["pixel_mask"] = encoding["pixel_mask"] +... batch["labels"] = labels +... return batch +``` + +## Training the DETR model +You have done most of the heavy lifting in the previous sections, so now you are ready to train your model! +The images in this dataset are still quite large, even after resizing. This means that finetuning this model will +require at least one GPU. + +Training involves the following steps: +1. Load the model with [`AutoModelForObjectDetection`] using the same checkpoint as in the preprocessing. +2. Define your training hyperparameters in [`TrainingArguments`]. +3. Pass the training arguments to [`Trainer`] along with the model, dataset, image processor, and data collator. +4. Call [`~Trainer.train`] to finetune your model. + +When loading the model from the same checkpoint that you used for the preprocessing, remember to pass the `label2id` +and `id2label` maps that you created earlier from the dataset's metadata. Additionally, we specify `ignore_mismatched_sizes=True` to replace the existing classification head with a new one. + +```py +>>> from transformers import AutoModelForObjectDetection + +>>> model = AutoModelForObjectDetection.from_pretrained( +... checkpoint, +... id2label=id2label, +... label2id=label2id, +... ignore_mismatched_sizes=True, +... ) +``` + +In the [`TrainingArguments`] use `output_dir` to specify where to save your model, then configure hyperparameters as you see fit. +It is important you do not remove unused columns because this will drop the image column. Without the image column, you +can't create `pixel_values`. For this reason, set `remove_unused_columns` to `False`. +If you wish to share your model by pushing to the Hub, set `push_to_hub` to `True` (you must be signed in to Hugging +Face to upload your model). + +```py +>>> from transformers import TrainingArguments + +>>> training_args = TrainingArguments( +... output_dir="detr-resnet-50_finetuned_cppe5", +... per_device_train_batch_size=8, +... num_train_epochs=10, +... fp16=True, +... save_steps=200, +... logging_steps=50, +... learning_rate=1e-5, +... weight_decay=1e-4, +... save_total_limit=2, +... remove_unused_columns=False, +... push_to_hub=True, +... ) +``` + +Finally, bring everything together, and call [`~transformers.Trainer.train`]: + +```py +>>> from transformers import Trainer + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... data_collator=collate_fn, +... train_dataset=cppe5["train"], +... tokenizer=image_processor, +... ) + +>>> trainer.train() +``` + +If you have set `push_to_hub` to `True` in the `training_args`, the training checkpoints are pushed to the +Hugging Face Hub. Upon training completion, push the final model to the Hub as well by calling the [`~transformers.Trainer.push_to_hub`] method. + +```py +>>> trainer.push_to_hub() +``` + +## Evaluate +Object detection models are commonly evaluated with a set of COCO-style metrics. 
+You can use one of the existing metrics implementations, but here you'll use the one from `torchvision` to evaluate the final +model that you pushed to the Hub. + +To use the `torchvision` evaluator, you'll need to prepare a ground truth COCO dataset. The API to build a COCO dataset +requires the data to be stored in a certain format, so you'll need to save images and annotations to disk first. Just like +when you prepared your data for training, the annotations from the `cppe5["test"]` need to be formatted. However, images +should stay as they are. + +The evaluation step requires a bit of work, but it can be split in three major steps. +First, prepare the `cppe5["test"]` set: format the annotations and save the data to disk. + +```py +>>> import json + + +>>> # format annotations the same as for training, no need for data augmentation +>>> def val_formatted_anns(image_id, objects): +... annotations = [] +... for i in range(0, len(objects["id"])): +... new_ann = { +... "id": objects["id"][i], +... "category_id": objects["category"][i], +... "iscrowd": 0, +... "image_id": image_id, +... "area": objects["area"][i], +... "bbox": objects["bbox"][i], +... } +... annotations.append(new_ann) + +... return annotations + + +>>> # Save images and annotations into the files torchvision.datasets.CocoDetection expects +>>> def save_cppe5_annotation_file_images(cppe5): +... output_json = {} +... path_output_cppe5 = f"{os.getcwd()}/cppe5/" + +... if not os.path.exists(path_output_cppe5): +... os.makedirs(path_output_cppe5) + +... path_anno = os.path.join(path_output_cppe5, "cppe5_ann.json") +... categories_json = [{"supercategory": "none", "id": id, "name": id2label[id]} for id in id2label] +... output_json["images"] = [] +... output_json["annotations"] = [] +... for example in cppe5: +... ann = val_formatted_anns(example["image_id"], example["objects"]) +... output_json["images"].append( +... { +... "id": example["image_id"], +... "width": example["image"].width, +... "height": example["image"].height, +... "file_name": f"{example['image_id']}.png", +... } +... ) +... output_json["annotations"].extend(ann) +... output_json["categories"] = categories_json + +... with open(path_anno, "w") as file: +... json.dump(output_json, file, ensure_ascii=False, indent=4) + +... for im, img_id in zip(cppe5["image"], cppe5["image_id"]): +... path_img = os.path.join(path_output_cppe5, f"{img_id}.png") +... im.save(path_img) + +... return path_output_cppe5, path_anno +``` + +Next, prepare an instance of a `CocoDetection` class that can be used with `cocoevaluator`. + +```py +>>> import torchvision + + +>>> class CocoDetection(torchvision.datasets.CocoDetection): +... def __init__(self, img_folder, image_processor, ann_file): +... super().__init__(img_folder, ann_file) +... self.image_processor = image_processor + +... def __getitem__(self, idx): +... # read in PIL image and target in COCO format +... img, target = super(CocoDetection, self).__getitem__(idx) + +... # preprocess image and target: converting target to DETR format, +... # resizing + normalization of both image and target) +... image_id = self.ids[idx] +... target = {"image_id": image_id, "annotations": target} +... encoding = self.image_processor(images=img, annotations=target, return_tensors="pt") +... pixel_values = encoding["pixel_values"].squeeze() # remove batch dimension +... target = encoding["labels"][0] # remove batch dimension + +... 
return {"pixel_values": pixel_values, "labels": target} + + +>>> im_processor = AutoImageProcessor.from_pretrained("devonho/detr-resnet-50_finetuned_cppe5") + +>>> path_output_cppe5, path_anno = save_cppe5_annotation_file_images(cppe5["test"]) +>>> test_ds_coco_format = CocoDetection(path_output_cppe5, im_processor, path_anno) +``` + +Finally, load the metrics and run the evaluation. + +```py +>>> import evaluate +>>> from tqdm import tqdm + +>>> model = AutoModelForObjectDetection.from_pretrained("devonho/detr-resnet-50_finetuned_cppe5") +>>> module = evaluate.load("ybelkada/cocoevaluate", coco=test_ds_coco_format.coco) +>>> val_dataloader = torch.utils.data.DataLoader( +... test_ds_coco_format, batch_size=8, shuffle=False, num_workers=4, collate_fn=collate_fn +... ) + +>>> with torch.no_grad(): +... for idx, batch in enumerate(tqdm(val_dataloader)): +... pixel_values = batch["pixel_values"] +... pixel_mask = batch["pixel_mask"] + +... labels = [ +... {k: v for k, v in t.items()} for t in batch["labels"] +... ] # these are in DETR format, resized + normalized + +... # forward pass +... outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + +... orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0) +... results = im_processor.post_process(outputs, orig_target_sizes) # convert outputs of model to COCO api + +... module.add(prediction=results, reference=labels) +... del batch + +>>> results = module.compute() +>>> print(results) +Accumulating evaluation results... +DONE (t=0.08s). +IoU metric: bbox + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.352 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.681 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.292 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.168 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.208 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.429 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.274 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.484 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.501 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.191 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.323 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.590 +``` +These results can be further improved by adjusting the hyperparameters in [`~transformers.TrainingArguments`]. Give it a go! + +## Inference +Now that you have finetuned a DETR model, evaluated it, and uploaded it to the Hugging Face Hub, you can use it for inference. +The simplest way to try out your finetuned model for inference is to use it in a [`Pipeline`]. 
Instantiate a pipeline +for object detection with your model, and pass an image to it: + +```py +>>> from transformers import pipeline +>>> import requests + +>>> url = "https://i.imgur.com/2lnWoly.jpg" +>>> image = Image.open(requests.get(url, stream=True).raw) + +>>> obj_detector = pipeline("object-detection", model="devonho/detr-resnet-50_finetuned_cppe5") +>>> obj_detector(image) +``` + +You can also manually replicate the results of the pipeline if you'd like: + +```py +>>> image_processor = AutoImageProcessor.from_pretrained("devonho/detr-resnet-50_finetuned_cppe5") +>>> model = AutoModelForObjectDetection.from_pretrained("devonho/detr-resnet-50_finetuned_cppe5") + +>>> with torch.no_grad(): +... inputs = image_processor(images=image, return_tensors="pt") +... outputs = model(**inputs) +... target_sizes = torch.tensor([image.size[::-1]]) +... results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0] + +>>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): +... box = [round(i, 2) for i in box.tolist()] +... print( +... f"Detected {model.config.id2label[label.item()]} with confidence " +... f"{round(score.item(), 3)} at location {box}" +... ) +Detected Coverall with confidence 0.566 at location [1215.32, 147.38, 4401.81, 3227.08] +Detected Mask with confidence 0.584 at location [2449.06, 823.19, 3256.43, 1413.9] +``` + +Let's plot the result: +```py +>>> draw = ImageDraw.Draw(image) + +>>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): +... box = [round(i, 2) for i in box.tolist()] +... x, y, x2, y2 = tuple(box) +... draw.rectangle((x, y, x2, y2), outline="red", width=1) +... draw.text((x, y), model.config.id2label[label.item()], fill="white") + +>>> image +``` + +
+ [Image: Object detection result on a new image]
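The `threshold` argument of `post_process_object_detection` controls how confident a detection has to be to be kept. If you want to inspect more (but less certain) candidate boxes, you can rerun the post-processing with a lower value. The snippet below is only an optional sketch that reuses `outputs`, `image_processor`, `results`, and `target_sizes` from the cells above; the variable names are illustrative:

```py
>>> results_low = image_processor.post_process_object_detection(
...     outputs, threshold=0.1, target_sizes=target_sizes
... )[0]

>>> # a lower threshold keeps more candidate detections than the 0.5 used above
>>> num_low, num_default = len(results_low["scores"]), len(results["scores"])
```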
diff --git a/docs/source/en/tasks/object_detection.mdx b/docs/source/en/tasks/object_detection.mdx deleted file mode 100644 index a2b8a12fb60d..000000000000 --- a/docs/source/en/tasks/object_detection.mdx +++ /dev/null @@ -1,584 +0,0 @@ - - -# Object detection - -[[open-in-colab]] - -Object detection is the computer vision task of detecting instances (such as humans, buildings, or cars) in an image. Object detection models receive an image as input and output -coordinates of the bounding boxes and associated labels of the detected objects. An image can contain multiple objects, -each with its own bounding box and a label (e.g. it can have a car and a building), and each object can -be present in different parts of an image (e.g. the image can have several cars). -This task is commonly used in autonomous driving for detecting things like pedestrians, road signs, and traffic lights. -Other applications include counting objects in images, image search, and more. - - -Check out the object detection task page to learn about use cases, -models, metrics, and datasets associated with this task. - - -In this guide, you will learn how to: - - 1. Finetune [DETR](https://huggingface.co/docs/transformers/model_doc/detr), a model that combines a convolutional - backbone with an encoder-decoder Transformer, on the [CPPE-5](https://huggingface.co/datasets/cppe-5) - dataset. - 2. Use your finetuned model for inference. - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install -q datasets transformers evaluate timm albumentations -``` - -You'll use 🤗 Datasets to load a dataset from the Hugging Face Hub, 🤗 Transformers to train your model, -and `albumentations` to augment the data. `timm` is currently required to load a convolutional backbone for the DETR model. - -We encourage you to share your model with the community. Log in to your Hugging Face account to upload it to the Hub. -When prompted, enter your token to log in: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load the CPPE-5 dataset - -The [CPPE-5 dataset](https://huggingface.co/datasets/cppe-5) contains images with -annotations identifying medical personal protective equipment (PPE) in the context of the COVID-19 pandemic. - -Start by loading the dataset: - -```py ->>> from datasets import load_dataset - ->>> cppe5 = load_dataset("cppe-5") ->>> cppe5 -DatasetDict({ - train: Dataset({ - features: ['image_id', 'image', 'width', 'height', 'objects'], - num_rows: 1000 - }) - test: Dataset({ - features: ['image_id', 'image', 'width', 'height', 'objects'], - num_rows: 29 - }) -}) -``` - -You'll see that this dataset already comes with a training set containing 1000 images and a test set with 29 images. - -To get familiar with the data, explore what the examples look like. 
- -```py ->>> cppe5["train"][0] -{'image_id': 15, - 'image': , - 'width': 943, - 'height': 663, - 'objects': {'id': [114, 115, 116, 117], - 'area': [3796, 1596, 152768, 81002], - 'bbox': [[302.0, 109.0, 73.0, 52.0], - [810.0, 100.0, 57.0, 28.0], - [160.0, 31.0, 248.0, 616.0], - [741.0, 68.0, 202.0, 401.0]], - 'category': [4, 4, 0, 0]}} -``` - -The examples in the dataset have the following fields: -- `image_id`: the example image id -- `image`: a `PIL.Image.Image` object containing the image -- `width`: width of the image -- `height`: height of the image -- `objects`: a dictionary containing bounding box metadata for the objects in the image: - - `id`: the annotation id - - `area`: the area of the bounding box - - `bbox`: the object's bounding box (in the [COCO format](https://albumentations.ai/docs/getting_started/bounding_boxes_augmentation/#coco) ) - - `category`: the object's category, with possible values including `Coverall (0)`, `Face_Shield (1)`, `Gloves (2)`, `Goggles (3)` and `Mask (4)` - -You may notice that the `bbox` field follows the COCO format, which is the format that the DETR model expects. -However, the grouping of the fields inside `objects` differs from the annotation format DETR requires. You will -need to apply some preprocessing transformations before using this data for training. - -To get an even better understanding of the data, visualize an example in the dataset. - -```py ->>> import numpy as np ->>> import os ->>> from PIL import Image, ImageDraw - ->>> image = cppe5["train"][0]["image"] ->>> annotations = cppe5["train"][0]["objects"] ->>> draw = ImageDraw.Draw(image) - ->>> categories = cppe5["train"].features["objects"].feature["category"].names - ->>> id2label = {index: x for index, x in enumerate(categories, start=0)} ->>> label2id = {v: k for k, v in id2label.items()} - ->>> for i in range(len(annotations["id"])): -... box = annotations["bbox"][i - 1] -... class_idx = annotations["category"][i - 1] -... x, y, w, h = tuple(box) -... draw.rectangle((x, y, x + w, y + h), outline="red", width=1) -... draw.text((x, y), id2label[class_idx], fill="white") - ->>> image -``` - -
- [Image: CPPE-5 Image Example]
- -To visualize the bounding boxes with associated labels, you can get the labels from the dataset's metadata, specifically -the `category` field. -You'll also want to create dictionaries that map a label id to a label class (`id2label`) and the other way around (`label2id`). -You can use them later when setting up the model. Including these maps will make your model reusable by others if you share -it on the Hugging Face Hub. - -As a final step of getting familiar with the data, explore it for potential issues. One common problem with datasets for -object detection is bounding boxes that "stretch" beyond the edge of the image. Such "runaway" bounding boxes can raise -errors during training and should be addressed at this stage. There are a few examples with this issue in this dataset. -To keep things simple in this guide, we remove these images from the data. - -```py ->>> remove_idx = [590, 821, 822, 875, 876, 878, 879] ->>> keep = [i for i in range(len(cppe5["train"])) if i not in remove_idx] ->>> cppe5["train"] = cppe5["train"].select(keep) -``` - -## Preprocess the data - -To finetune a model, you must preprocess the data you plan to use to match precisely the approach used for the pre-trained model. -[`AutoImageProcessor`] takes care of processing image data to create `pixel_values`, `pixel_mask`, and -`labels` that a DETR model can train with. The image processor has some attributes that you won't have to worry about: - -- `image_mean = [0.485, 0.456, 0.406 ]` -- `image_std = [0.229, 0.224, 0.225]` - -These are the mean and standard deviation used to normalize images during the model pre-training. These values are crucial -to replicate when doing inference or finetuning a pre-trained image model. - -Instantiate the image processor from the same checkpoint as the model you want to finetune. - -```py ->>> from transformers import AutoImageProcessor - ->>> checkpoint = "facebook/detr-resnet-50" ->>> image_processor = AutoImageProcessor.from_pretrained(checkpoint) -``` - -Before passing the images to the `image_processor`, apply two preprocessing transformations to the dataset: -- Augmenting images -- Reformatting annotations to meet DETR expectations - -First, to make sure the model does not overfit on the training data, you can apply image augmentation with any data augmentation library. Here we use [Albumentations](https://albumentations.ai/docs/) ... -This library ensures that transformations affect the image and update the bounding boxes accordingly. -The 🤗 Datasets library documentation has a detailed [guide on how to augment images for object detection](https://huggingface.co/docs/datasets/object_detection), -and it uses the exact same dataset as an example. Apply the same approach here, resize each image to (480, 480), -flip it horizontally, and brighten it: - -```py ->>> import albumentations ->>> import numpy as np ->>> import torch - ->>> transform = albumentations.Compose( -... [ -... albumentations.Resize(480, 480), -... albumentations.HorizontalFlip(p=1.0), -... albumentations.RandomBrightnessContrast(p=1.0), -... ], -... bbox_params=albumentations.BboxParams(format="coco", label_fields=["category"]), -... ) -``` - -The `image_processor` expects the annotations to be in the following format: `{'image_id': int, 'annotations': List[Dict]}`, - where each dictionary is a COCO object annotation. Let's add a function to reformat annotations for a single example: - -```py ->>> def formatted_anns(image_id, category, area, bbox): - -... annotations = [] -... 
for i in range(0, len(category)): -... new_ann = { -... "image_id": image_id, -... "category_id": category[i], -... "isCrowd": 0, -... "area": area[i], -... "bbox": list(bbox[i]), -... } -... annotations.append(new_ann) - -... return annotations -``` - -Now you can combine the image and annotation transformations to use on a batch of examples: - -```py ->>> # transforming a batch ->>> def transform_aug_ann(examples): -... image_ids = examples["image_id"] -... images, bboxes, area, categories = [], [], [], [] -... for image, objects in zip(examples["image"], examples["objects"]): -... image = np.array(image.convert("RGB"))[:, :, ::-1] -... out = transform(image=image, bboxes=objects["bbox"], category=objects["category"]) - -... area.append(objects["area"]) -... images.append(out["image"]) -... bboxes.append(out["bboxes"]) -... categories.append(out["category"]) - -... targets = [ -... {"image_id": id_, "annotations": formatted_anns(id_, cat_, ar_, box_)} -... for id_, cat_, ar_, box_ in zip(image_ids, categories, area, bboxes) -... ] - -... return image_processor(images=images, annotations=targets, return_tensors="pt") -``` - -Apply this preprocessing function to the entire dataset using 🤗 Datasets [`~datasets.Dataset.with_transform`] method. This method applies -transformations on the fly when you load an element of the dataset. - -At this point, you can check what an example from the dataset looks like after the transformations. You should see a tensor -with `pixel_values`, a tensor with `pixel_mask`, and `labels`. - -```py ->>> cppe5["train"] = cppe5["train"].with_transform(transform_aug_ann) ->>> cppe5["train"][15] -{'pixel_values': tensor([[[ 0.9132, 0.9132, 0.9132, ..., -1.9809, -1.9809, -1.9809], - [ 0.9132, 0.9132, 0.9132, ..., -1.9809, -1.9809, -1.9809], - [ 0.9132, 0.9132, 0.9132, ..., -1.9638, -1.9638, -1.9638], - ..., - [-1.5699, -1.5699, -1.5699, ..., -1.9980, -1.9980, -1.9980], - [-1.5528, -1.5528, -1.5528, ..., -1.9980, -1.9809, -1.9809], - [-1.5528, -1.5528, -1.5528, ..., -1.9980, -1.9809, -1.9809]], - - [[ 1.3081, 1.3081, 1.3081, ..., -1.8431, -1.8431, -1.8431], - [ 1.3081, 1.3081, 1.3081, ..., -1.8431, -1.8431, -1.8431], - [ 1.3081, 1.3081, 1.3081, ..., -1.8256, -1.8256, -1.8256], - ..., - [-1.3179, -1.3179, -1.3179, ..., -1.8606, -1.8606, -1.8606], - [-1.3004, -1.3004, -1.3004, ..., -1.8606, -1.8431, -1.8431], - [-1.3004, -1.3004, -1.3004, ..., -1.8606, -1.8431, -1.8431]], - - [[ 1.4200, 1.4200, 1.4200, ..., -1.6476, -1.6476, -1.6476], - [ 1.4200, 1.4200, 1.4200, ..., -1.6476, -1.6476, -1.6476], - [ 1.4200, 1.4200, 1.4200, ..., -1.6302, -1.6302, -1.6302], - ..., - [-1.0201, -1.0201, -1.0201, ..., -1.5604, -1.5604, -1.5604], - [-1.0027, -1.0027, -1.0027, ..., -1.5604, -1.5430, -1.5430], - [-1.0027, -1.0027, -1.0027, ..., -1.5604, -1.5430, -1.5430]]]), - 'pixel_mask': tensor([[1, 1, 1, ..., 1, 1, 1], - [1, 1, 1, ..., 1, 1, 1], - [1, 1, 1, ..., 1, 1, 1], - ..., - [1, 1, 1, ..., 1, 1, 1], - [1, 1, 1, ..., 1, 1, 1], - [1, 1, 1, ..., 1, 1, 1]]), - 'labels': {'size': tensor([800, 800]), 'image_id': tensor([756]), 'class_labels': tensor([4]), 'boxes': tensor([[0.7340, 0.6986, 0.3414, 0.5944]]), 'area': tensor([519544.4375]), 'iscrowd': tensor([0]), 'orig_size': tensor([480, 480])}} -``` - -You have successfully augmented the individual images and prepared their annotations. However, preprocessing isn't -complete yet. In the final step, create a custom `collate_fn` to batch images together. 
-Pad images (which are now `pixel_values`) to the largest image in a batch, and create a corresponding `pixel_mask` -to indicate which pixels are real (1) and which are padding (0). - -```py ->>> def collate_fn(batch): -... pixel_values = [item["pixel_values"] for item in batch] -... encoding = image_processor.pad_and_create_pixel_mask(pixel_values, return_tensors="pt") -... labels = [item["labels"] for item in batch] -... batch = {} -... batch["pixel_values"] = encoding["pixel_values"] -... batch["pixel_mask"] = encoding["pixel_mask"] -... batch["labels"] = labels -... return batch -``` - -## Training the DETR model -You have done most of the heavy lifting in the previous sections, so now you are ready to train your model! -The images in this dataset are still quite large, even after resizing. This means that finetuning this model will -require at least one GPU. - -Training involves the following steps: -1. Load the model with [`AutoModelForObjectDetection`] using the same checkpoint as in the preprocessing. -2. Define your training hyperparameters in [`TrainingArguments`]. -3. Pass the training arguments to [`Trainer`] along with the model, dataset, image processor, and data collator. -4. Call [`~Trainer.train`] to finetune your model. - -When loading the model from the same checkpoint that you used for the preprocessing, remember to pass the `label2id` -and `id2label` maps that you created earlier from the dataset's metadata. Additionally, we specify `ignore_mismatched_sizes=True` to replace the existing classification head with a new one. - -```py ->>> from transformers import AutoModelForObjectDetection - ->>> model = AutoModelForObjectDetection.from_pretrained( -... checkpoint, -... id2label=id2label, -... label2id=label2id, -... ignore_mismatched_sizes=True, -... ) -``` - -In the [`TrainingArguments`] use `output_dir` to specify where to save your model, then configure hyperparameters as you see fit. -It is important you do not remove unused columns because this will drop the image column. Without the image column, you -can't create `pixel_values`. For this reason, set `remove_unused_columns` to `False`. -If you wish to share your model by pushing to the Hub, set `push_to_hub` to `True` (you must be signed in to Hugging -Face to upload your model). - -```py ->>> from transformers import TrainingArguments - ->>> training_args = TrainingArguments( -... output_dir="detr-resnet-50_finetuned_cppe5", -... per_device_train_batch_size=8, -... num_train_epochs=10, -... fp16=True, -... save_steps=200, -... logging_steps=50, -... learning_rate=1e-5, -... weight_decay=1e-4, -... save_total_limit=2, -... remove_unused_columns=False, -... push_to_hub=True, -... ) -``` - -Finally, bring everything together, and call [`~transformers.Trainer.train`]: - -```py ->>> from transformers import Trainer - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... data_collator=collate_fn, -... train_dataset=cppe5["train"], -... tokenizer=image_processor, -... ) - ->>> trainer.train() -``` - -If you have set `push_to_hub` to `True` in the `training_args`, the training checkpoints are pushed to the -Hugging Face Hub. Upon training completion, push the final model to the Hub as well by calling the [`~transformers.Trainer.push_to_hub`] method. - -```py ->>> trainer.push_to_hub() -``` - -## Evaluate -Object detection models are commonly evaluated with a set of COCO-style metrics. 
-You can use one of the existing metrics implementations, but here you'll use the one from `torchvision` to evaluate the final -model that you pushed to the Hub. - -To use the `torchvision` evaluator, you'll need to prepare a ground truth COCO dataset. The API to build a COCO dataset -requires the data to be stored in a certain format, so you'll need to save images and annotations to disk first. Just like -when you prepared your data for training, the annotations from the `cppe5["test"]` need to be formatted. However, images -should stay as they are. - -The evaluation step requires a bit of work, but it can be split in three major steps. -First, prepare the `cppe5["test"]` set: format the annotations and save the data to disk. - -```py ->>> import json - ->>> # format annotations the same as for training, no need for data augmentation ->>> def val_formatted_anns(image_id, objects): -... annotations = [] -... for i in range(0, len(objects["id"])): -... new_ann = { -... "id": objects["id"][i], -... "category_id": objects["category"][i], -... "iscrowd": 0, -... "image_id": image_id, -... "area": objects["area"][i], -... "bbox": objects["bbox"][i], -... } -... annotations.append(new_ann) - -... return annotations - - ->>> # Save images and annotations into the files torchvision.datasets.CocoDetection expects ->>> def save_cppe5_annotation_file_images(cppe5): -... output_json = {} -... path_output_cppe5 = f"{os.getcwd()}/cppe5/" - -... if not os.path.exists(path_output_cppe5): -... os.makedirs(path_output_cppe5) - -... path_anno = os.path.join(path_output_cppe5, "cppe5_ann.json") -... categories_json = [{"supercategory": "none", "id": id, "name": id2label[id]} for id in id2label] -... output_json["images"] = [] -... output_json["annotations"] = [] -... for example in cppe5: -... ann = val_formatted_anns(example["image_id"], example["objects"]) -... output_json["images"].append( -... { -... "id": example["image_id"], -... "width": example["image"].width, -... "height": example["image"].height, -... "file_name": f"{example['image_id']}.png", -... } -... ) -... output_json["annotations"].extend(ann) -... output_json["categories"] = categories_json - -... with open(path_anno, "w") as file: -... json.dump(output_json, file, ensure_ascii=False, indent=4) - -... for im, img_id in zip(cppe5["image"], cppe5["image_id"]): -... path_img = os.path.join(path_output_cppe5, f"{img_id}.png") -... im.save(path_img) - -... return path_output_cppe5, path_anno -``` - -Next, prepare an instance of a `CocoDetection` class that can be used with `cocoevaluator`. - -```py ->>> import torchvision - - ->>> class CocoDetection(torchvision.datasets.CocoDetection): -... def __init__(self, img_folder, feature_extractor, ann_file): -... super().__init__(img_folder, ann_file) -... self.feature_extractor = feature_extractor - -... def __getitem__(self, idx): -... # read in PIL image and target in COCO format -... img, target = super(CocoDetection, self).__getitem__(idx) - -... # preprocess image and target: converting target to DETR format, -... # resizing + normalization of both image and target) -... image_id = self.ids[idx] -... target = {"image_id": image_id, "annotations": target} -... encoding = self.feature_extractor(images=img, annotations=target, return_tensors="pt") -... pixel_values = encoding["pixel_values"].squeeze() # remove batch dimension -... target = encoding["labels"][0] # remove batch dimension - -... 
return {"pixel_values": pixel_values, "labels": target} - - ->>> im_processor = AutoImageProcessor.from_pretrained("MariaK/detr-resnet-50_finetuned_cppe5") - ->>> path_output_cppe5, path_anno = save_cppe5_annotation_file_images(cppe5["test"]) ->>> test_ds_coco_format = CocoDetection(path_output_cppe5, im_processor, path_anno) -``` - -Finally, load the metrics and run the evaluation. - -```py ->>> import evaluate ->>> from tqdm import tqdm - ->>> model = AutoModelForObjectDetection.from_pretrained("MariaK/detr-resnet-50_finetuned_cppe5") ->>> module = evaluate.load("ybelkada/cocoevaluate", coco=test_ds_coco_format.coco) ->>> val_dataloader = torch.utils.data.DataLoader( -... test_ds_coco_format, batch_size=8, shuffle=False, num_workers=4, collate_fn=collate_fn -... ) - ->>> with torch.no_grad(): -... for idx, batch in enumerate(tqdm(val_dataloader)): -... pixel_values = batch["pixel_values"] -... pixel_mask = batch["pixel_mask"] - -... labels = [ -... {k: v for k, v in t.items()} for t in batch["labels"] -... ] # these are in DETR format, resized + normalized - -... # forward pass -... outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask) - -... orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0) -... results = im_processor.post_process(outputs, orig_target_sizes) # convert outputs of model to COCO api - -... module.add(prediction=results, reference=labels) -... del batch - ->>> results = module.compute() ->>> print(results) -Accumulating evaluation results... -DONE (t=0.08s). -IoU metric: bbox - Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.150 - Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.280 - Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.130 - Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.038 - Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.036 - Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.182 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.166 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.317 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.335 - Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.104 - Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.146 - Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.382 -``` -These results can be further improved by adjusting the hyperparameters in [`~transformers.TrainingArguments`]. Give it a go! - -## Inference -Now that you have finetuned a DETR model, evaluated it, and uploaded it to the Hugging Face Hub, you can use it for inference. -The simplest way to try out your finetuned model for inference is to use it in a [`Pipeline`]. 
Instantiate a pipeline -for object detection with your model, and pass an image to it: - -```py ->>> from transformers import pipeline ->>> import requests - ->>> url = "https://i.imgur.com/2lnWoly.jpg" ->>> image = Image.open(requests.get(url, stream=True).raw) - ->>> obj_detector = pipeline("object-detection", model="MariaK/detr-resnet-50_finetuned_cppe5") ->>> obj_detector(image) -``` - -You can also manually replicate the results of the pipeline if you'd like: - -```py ->>> image_processor = AutoImageProcessor.from_pretrained("MariaK/detr-resnet-50_finetuned_cppe5") ->>> model = AutoModelForObjectDetection.from_pretrained("MariaK/detr-resnet-50_finetuned_cppe5") - ->>> with torch.no_grad(): -... inputs = image_processor(images=image, return_tensors="pt") -... outputs = model(**inputs) -... target_sizes = torch.tensor([image.size[::-1]]) -... results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0] - ->>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): -... box = [round(i, 2) for i in box.tolist()] -... print( -... f"Detected {model.config.id2label[label.item()]} with confidence " -... f"{round(score.item(), 3)} at location {box}" -... ) -Detected Coverall with confidence 0.566 at location [1215.32, 147.38, 4401.81, 3227.08] -Detected Mask with confidence 0.584 at location [2449.06, 823.19, 3256.43, 1413.9] -``` - -Let's plot the result: -```py ->>> draw = ImageDraw.Draw(image) - ->>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): -... box = [round(i, 2) for i in box.tolist()] -... x, y, x2, y2 = tuple(box) -... draw.rectangle((x, y, x2, y2), outline="red", width=1) -... draw.text((x, y), model.config.id2label[label.item()], fill="white") - ->>> image -``` - -
- [Image: Object detection result on a new image]
- diff --git a/docs/source/en/tasks/question_answering.md b/docs/source/en/tasks/question_answering.md new file mode 100644 index 000000000000..0db26ab8cbb7 --- /dev/null +++ b/docs/source/en/tasks/question_answering.md @@ -0,0 +1,434 @@ + + +# Question answering + +[[open-in-colab]] + + + +Question answering tasks return an answer given a question. If you've ever asked a virtual assistant like Alexa, Siri or Google what the weather is, then you've used a question answering model before. There are two common types of question answering tasks: + +- Extractive: extract the answer from the given context. +- Abstractive: generate an answer from the context that correctly answers the question. + +This guide will show you how to: + +1. Finetune [DistilBERT](https://huggingface.co/distilbert-base-uncased) on the [SQuAD](https://huggingface.co/datasets/squad) dataset for extractive question answering. +2. Use your finetuned model for inference. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + + +[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [Falcon](../model_doc/falcon), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [OpenAI GPT-2](../model_doc/gpt2), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [LXMERT](../model_doc/lxmert), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MPT](../model_doc/mpt), [MRA](../model_doc/mra), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OPT](../model_doc/opt), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Splinter](../model_doc/splinter), [SqueezeBERT](../model_doc/squeezebert), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso) + + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate +``` + +We encourage you to login to your Hugging Face account so you can upload and share your model with the community. 
When prompted, enter your token to login: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load SQuAD dataset + +Start by loading a smaller subset of the SQuAD dataset from the 🤗 Datasets library. This'll give you a chance to experiment and make sure everything works before spending more time training on the full dataset. + +```py +>>> from datasets import load_dataset + +>>> squad = load_dataset("squad", split="train[:5000]") +``` + +Split the dataset's `train` split into a train and test set with the [`~datasets.Dataset.train_test_split`] method: + +```py +>>> squad = squad.train_test_split(test_size=0.2) +``` + +Then take a look at an example: + +```py +>>> squad["train"][0] +{'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']}, + 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', + 'id': '5733be284776f41900661182', + 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', + 'title': 'University_of_Notre_Dame' +} +``` + +There are several important fields here: + +- `answers`: the starting location of the answer token and the answer text. +- `context`: background information from which the model needs to extract the answer. +- `question`: the question a model should answer. + +## Preprocess + + + +The next step is to load a DistilBERT tokenizer to process the `question` and `context` fields: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") +``` + +There are a few preprocessing steps particular to question answering tasks you should be aware of: + +1. Some examples in a dataset may have a very long `context` that exceeds the maximum input length of the model. To deal with longer sequences, truncate only the `context` by setting `truncation="only_second"`. +2. Next, map the start and end positions of the answer to the original `context` by setting + `return_offset_mapping=True`. +3. With the mapping in hand, now you can find the start and end tokens of the answer. Use the [`~tokenizers.Encoding.sequence_ids`] method to + find which part of the offset corresponds to the `question` and which corresponds to the `context`. + +Here is how you can create a function to truncate and map the start and end tokens of the `answer` to the `context`: + +```py +>>> def preprocess_function(examples): +... questions = [q.strip() for q in examples["question"]] +... inputs = tokenizer( +... questions, +... examples["context"], +... max_length=384, +... truncation="only_second", +... return_offsets_mapping=True, +... padding="max_length", +... ) + +... offset_mapping = inputs.pop("offset_mapping") +... answers = examples["answers"] +... start_positions = [] +... end_positions = [] + +... for i, offset in enumerate(offset_mapping): +... answer = answers[i] +... 
start_char = answer["answer_start"][0] +... end_char = answer["answer_start"][0] + len(answer["text"][0]) +... sequence_ids = inputs.sequence_ids(i) + +... # Find the start and end of the context +... idx = 0 +... while sequence_ids[idx] != 1: +... idx += 1 +... context_start = idx +... while sequence_ids[idx] == 1: +... idx += 1 +... context_end = idx - 1 + +... # If the answer is not fully inside the context, label it (0, 0) +... if offset[context_start][0] > end_char or offset[context_end][1] < start_char: +... start_positions.append(0) +... end_positions.append(0) +... else: +... # Otherwise it's the start and end token positions +... idx = context_start +... while idx <= context_end and offset[idx][0] <= start_char: +... idx += 1 +... start_positions.append(idx - 1) + +... idx = context_end +... while idx >= context_start and offset[idx][1] >= end_char: +... idx -= 1 +... end_positions.append(idx + 1) + +... inputs["start_positions"] = start_positions +... inputs["end_positions"] = end_positions +... return inputs +``` + +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once. Remove any columns you don't need: + +```py +>>> tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names) +``` + +Now create a batch of examples using [`DefaultDataCollator`]. Unlike other data collators in 🤗 Transformers, the [`DefaultDataCollator`] does not apply any additional preprocessing such as padding. + + + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator() +``` + + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator(return_tensors="tf") +``` + + + +## Train + + + + + +If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! + + + +You're ready to start training your model now! Load DistilBERT with [`AutoModelForQuestionAnswering`]: + +```py +>>> from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer + +>>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased") +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). +2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, and data collator. +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_qa_model", +... evaluation_strategy="epoch", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... num_train_epochs=3, +... weight_decay=0.01, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_squad["train"], +... eval_dataset=tokenized_squad["test"], +... tokenizer=tokenizer, +... data_collator=data_collator, +... 
) + +>>> trainer.train() +``` + +Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + +If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! + + +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +```py +>>> from transformers import create_optimizer + +>>> batch_size = 16 +>>> num_epochs = 2 +>>> total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs +>>> optimizer, schedule = create_optimizer( +... init_lr=2e-5, +... num_warmup_steps=0, +... num_train_steps=total_train_steps, +... ) +``` + +Then you can load DistilBERT with [`TFAutoModelForQuestionAnswering`]: + +```py +>>> from transformers import TFAutoModelForQuestionAnswering + +>>> model = TFAutoModelForQuestionAnswering("distilbert-base-uncased") +``` + +Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_squad["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_validation_set = model.prepare_tf_dataset( +... tokenized_squad["test"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) +``` + +The last thing to setup before you start training is to provide a way to push your model to the Hub. This can be done by specifying where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> callback = PushToHubCallback( +... output_dir="my_awesome_qa_model", +... tokenizer=tokenizer, +... ) +``` + +Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callback to finetune the model: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=[callback]) +``` +Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! + + + + + +For a more in-depth example of how to finetune a model for question answering, take a look at the corresponding +[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb) +or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb). + + + +## Evaluate + +Evaluation for question answering requires a significant amount of postprocessing. To avoid taking up too much of your time, this guide skips the evaluation step. The [`Trainer`] still calculates the evaluation loss during training so you're not completely in the dark about your model's performance. + +If have more time and you're interested in how to evaluate your model for question answering, take a look at the [Question answering](https://huggingface.co/course/chapter7/7?fw=pt#postprocessing) chapter from the 🤗 Hugging Face Course! 
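As a taste of what that evaluation would look like, the metric itself is straightforward to load with 🤗 Evaluate once you have postprocessed the start/end logits into answer strings (the step the course chapter walks through). The snippet below is only a minimal sketch of the expected input format, using a single hand-written prediction and the reference from the example shown earlier rather than real model outputs:

```py
>>> import evaluate

>>> squad_metric = evaluate.load("squad")

>>> # in practice these lists come from your postprocessed model outputs and the dataset
>>> predictions = [{"id": "5733be284776f41900661182", "prediction_text": "Saint Bernadette Soubirous"}]
>>> references = [
...     {
...         "id": "5733be284776f41900661182",
...         "answers": {"text": ["Saint Bernadette Soubirous"], "answer_start": [515]},
...     }
... ]

>>> squad_metric.compute(predictions=predictions, references=references)
{'exact_match': 100.0, 'f1': 100.0}
```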
+ +## Inference + +Great, now that you've finetuned a model, you can use it for inference! + +Come up with a question and some context you'd like the model to predict: + +```py +>>> question = "How many programming languages does BLOOM support?" +>>> context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages." +``` + +The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for question answering with your model, and pass your text to it: + +```py +>>> from transformers import pipeline + +>>> question_answerer = pipeline("question-answering", model="my_awesome_qa_model") +>>> question_answerer(question=question, context=context) +{'score': 0.2058267742395401, + 'start': 10, + 'end': 95, + 'answer': '176 billion parameters and can generate text in 46 languages natural languages and 13'} +``` + +You can also manually replicate the results of the `pipeline` if you'd like: + + + +Tokenize the text and return PyTorch tensors: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model") +>>> inputs = tokenizer(question, context, return_tensors="pt") +``` + +Pass your inputs to the model and return the `logits`: + +```py +>>> import torch +>>> from transformers import AutoModelForQuestionAnswering + +>>> model = AutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model") +>>> with torch.no_grad(): +... outputs = model(**inputs) +``` + +Get the highest probability from the model output for the start and end positions: + +```py +>>> answer_start_index = outputs.start_logits.argmax() +>>> answer_end_index = outputs.end_logits.argmax() +``` + +Decode the predicted tokens to get the answer: + +```py +>>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1] +>>> tokenizer.decode(predict_answer_tokens) +'176 billion parameters and can generate text in 46 languages natural languages and 13' +``` + + +Tokenize the text and return TensorFlow tensors: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model") +>>> inputs = tokenizer(question, text, return_tensors="tf") +``` + +Pass your inputs to the model and return the `logits`: + +```py +>>> from transformers import TFAutoModelForQuestionAnswering + +>>> model = TFAutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model") +>>> outputs = model(**inputs) +``` + +Get the highest probability from the model output for the start and end positions: + +```py +>>> answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0]) +>>> answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0]) +``` + +Decode the predicted tokens to get the answer: + +```py +>>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1] +>>> tokenizer.decode(predict_answer_tokens) +'176 billion parameters and can generate text in 46 languages natural languages and 13' +``` + + diff --git a/docs/source/en/tasks/question_answering.mdx b/docs/source/en/tasks/question_answering.mdx deleted file mode 100644 index deabef4f04f4..000000000000 --- a/docs/source/en/tasks/question_answering.mdx +++ /dev/null @@ -1,421 +0,0 @@ - - -# Question answering - -[[open-in-colab]] - - - -Question answering tasks return an answer given a question. 
If you've ever asked a virtual assistant like Alexa, Siri or Google what the weather is, then you've used a question answering model before. There are two common types of question answering tasks: - -- Extractive: extract the answer from the given context. -- Abstractive: generate an answer from the context that correctly answers the question. - -This guide will show you how to: - -1. Finetune [DistilBERT](https://huggingface.co/distilbert-base-uncased) on the [SQuAD](https://huggingface.co/datasets/squad) dataset for extractive question answering. -2. Use your finetuned model for inference. - - - -See the question answering [task page](https://huggingface.co/tasks/question-answering) for more information about other forms of question answering and their associated models, datasets, and metrics. - - - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install transformers datasets evaluate -``` - -We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load SQuAD dataset - -Start by loading a smaller subset of the SQuAD dataset from the 🤗 Datasets library. This'll give you a chance to experiment and make sure everythings works before spending more time training on the full dataset. - -```py ->>> from datasets import load_dataset - ->>> squad = load_dataset("squad", split="train[:5000]") -``` - -Split the dataset's `train` split into a train and test set with the [`~datasets.Dataset.train_test_split`] method: - -```py ->>> squad = squad.train_test_split(test_size=0.2) -``` - -Then take a look at an example: - -```py ->>> squad["train"][0] -{'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']}, - 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', - 'id': '5733be284776f41900661182', - 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', - 'title': 'University_of_Notre_Dame' -} -``` - -There are several important fields here: - -- `answers`: the starting location of the answer token and the answer text. -- `context`: background information from which the model needs to extract the answer. -- `question`: the question a model should answer. - -## Preprocess - - - -The next step is to load a DistilBERT tokenizer to process the `question` and `context` fields: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") -``` - -There are a few preprocessing steps particular to question answering tasks you should be aware of: - -1. Some examples in a dataset may have a very long `context` that exceeds the maximum input length of the model. 
To deal with longer sequences, truncate only the `context` by setting `truncation="only_second"`. -2. Next, map the start and end positions of the answer to the original `context` by setting - `return_offset_mapping=True`. -3. With the mapping in hand, now you can find the start and end tokens of the answer. Use the [`~tokenizers.Encoding.sequence_ids`] method to - find which part of the offset corresponds to the `question` and which corresponds to the `context`. - -Here is how you can create a function to truncate and map the start and end tokens of the `answer` to the `context`: - -```py ->>> def preprocess_function(examples): -... questions = [q.strip() for q in examples["question"]] -... inputs = tokenizer( -... questions, -... examples["context"], -... max_length=384, -... truncation="only_second", -... return_offsets_mapping=True, -... padding="max_length", -... ) - -... offset_mapping = inputs.pop("offset_mapping") -... answers = examples["answers"] -... start_positions = [] -... end_positions = [] - -... for i, offset in enumerate(offset_mapping): -... answer = answers[i] -... start_char = answer["answer_start"][0] -... end_char = answer["answer_start"][0] + len(answer["text"][0]) -... sequence_ids = inputs.sequence_ids(i) - -... # Find the start and end of the context -... idx = 0 -... while sequence_ids[idx] != 1: -... idx += 1 -... context_start = idx -... while sequence_ids[idx] == 1: -... idx += 1 -... context_end = idx - 1 - -... # If the answer is not fully inside the context, label it (0, 0) -... if offset[context_start][0] > end_char or offset[context_end][1] < start_char: -... start_positions.append(0) -... end_positions.append(0) -... else: -... # Otherwise it's the start and end token positions -... idx = context_start -... while idx <= context_end and offset[idx][0] <= start_char: -... idx += 1 -... start_positions.append(idx - 1) - -... idx = context_end -... while idx >= context_start and offset[idx][1] >= end_char: -... idx -= 1 -... end_positions.append(idx + 1) - -... inputs["start_positions"] = start_positions -... inputs["end_positions"] = end_positions -... return inputs -``` - -To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once. Remove any columns you don't need: - -```py ->>> tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names) -``` - -Now create a batch of examples using [`DefaultDataCollator`]. Unlike other data collators in 🤗 Transformers, the [`DefaultDataCollator`] does not apply any additional preprocessing such as padding. - - - -```py ->>> from transformers import DefaultDataCollator - ->>> data_collator = DefaultDataCollator() -``` - - -```py ->>> from transformers import DefaultDataCollator - ->>> data_collator = DefaultDataCollator(return_tensors="tf") -``` - - - -## Train - - - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! - - -You're ready to start training your model now! Load DistilBERT with [`AutoModelForQuestionAnswering`]: - -```py ->>> from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer - ->>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased") -``` - -At this point, only three steps remain: - -1. 
Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). -2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, and data collator. -3. Call [`~Trainer.train`] to finetune your model. - -```py ->>> training_args = TrainingArguments( -... output_dir="my_awesome_qa_model", -... evaluation_strategy="epoch", -... learning_rate=2e-5, -... per_device_train_batch_size=16, -... per_device_eval_batch_size=16, -... num_train_epochs=3, -... weight_decay=0.01, -... push_to_hub=True, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=tokenized_squad["train"], -... eval_dataset=tokenized_squad["test"], -... tokenizer=tokenizer, -... data_collator=data_collator, -... ) - ->>> trainer.train() -``` - -Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - - - - -If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! - - -To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: - -```py ->>> from transformers import create_optimizer - ->>> batch_size = 16 ->>> num_epochs = 2 ->>> total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs ->>> optimizer, schedule = create_optimizer( -... init_lr=2e-5, -... num_warmup_steps=0, -... num_train_steps=total_train_steps, -... ) -``` - -Then you can load DistilBERT with [`TFAutoModelForQuestionAnswering`]: - -```py ->>> from transformers import TFAutoModelForQuestionAnswering - ->>> model = TFAutoModelForQuestionAnswering("distilbert-base-uncased") -``` - -Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: - -```py ->>> tf_train_set = model.prepare_tf_dataset( -... tokenized_squad["train"], -... shuffle=True, -... batch_size=16, -... collate_fn=data_collator, -... ) - ->>> tf_validation_set = model.prepare_tf_dataset( -... tokenized_squad["test"], -... shuffle=False, -... batch_size=16, -... collate_fn=data_collator, -... ) -``` - -Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> import tensorflow as tf - ->>> model.compile(optimizer=optimizer) -``` - -The last thing to setup before you start training is to provide a way to push your model to the Hub. This can be done by specifying where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: - -```py ->>> from transformers.keras_callbacks import PushToHubCallback - ->>> callback = PushToHubCallback( -... output_dir="my_awesome_qa_model", -... tokenizer=tokenizer, -... ) -``` - -Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callback to finetune the model: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=[callback]) -``` -Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! 
- - - - - -For a more in-depth example of how to finetune a model for question answering, take a look at the corresponding -[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb) -or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb). - - - -## Evaluate - -Evaluation for question answering requires a significant amount of postprocessing. To avoid taking up too much of your time, this guide skips the evaluation step. The [`Trainer`] still calculates the evaluation loss during training so you're not completely in the dark about your model's performance. - -If have more time and you're interested in how to evaluate your model for question answering, take a look at the [Question answering](https://huggingface.co/course/chapter7/7?fw=pt#postprocessing) chapter from the 🤗 Hugging Face Course! - -## Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Come up with a question and some context you'd like the model to predict: - -```py ->>> question = "How many programming languages does BLOOM support?" ->>> context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages." -``` - -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for question answering with your model, and pass your text to it: - -```py ->>> from transformers import pipeline - ->>> question_answerer = pipeline("question-answering", model="my_awesome_qa_model") ->>> question_answerer(question=question, context=context) -{'score': 0.2058267742395401, - 'start': 10, - 'end': 95, - 'answer': '176 billion parameters and can generate text in 46 languages natural languages and 13'} -``` - -You can also manually replicate the results of the `pipeline` if you'd like: - - - -Tokenize the text and return PyTorch tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model") ->>> inputs = tokenizer(question, context, return_tensors="pt") -``` - -Pass your inputs to the model and return the `logits`: - -```py ->>> from transformers import AutoModelForQuestionAnswering - ->>> model = AutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model") ->>> with torch.no_grad(): -... 
outputs = model(**inputs) -``` - -Get the highest probability from the model output for the start and end positions: - -```py ->>> answer_start_index = outputs.start_logits.argmax() ->>> answer_end_index = outputs.end_logits.argmax() -``` - -Decode the predicted tokens to get the answer: - -```py ->>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1] ->>> tokenizer.decode(predict_answer_tokens) -'176 billion parameters and can generate text in 46 languages natural languages and 13' -``` - - -Tokenize the text and return TensorFlow tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model") ->>> inputs = tokenizer(question, text, return_tensors="tf") -``` - -Pass your inputs to the model and return the `logits`: - -```py ->>> from transformers import TFAutoModelForQuestionAnswering - ->>> model = TFAutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model") ->>> outputs = model(**inputs) -``` - -Get the highest probability from the model output for the start and end positions: - -```py ->>> answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0]) ->>> answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0]) -``` - -Decode the predicted tokens to get the answer: - -```py ->>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1] ->>> tokenizer.decode(predict_answer_tokens) -'176 billion parameters and can generate text in 46 languages natural languages and 13' -``` - - \ No newline at end of file diff --git a/docs/source/en/tasks/semantic_segmentation.md b/docs/source/en/tasks/semantic_segmentation.md new file mode 100644 index 000000000000..c3ad3e00f61a --- /dev/null +++ b/docs/source/en/tasks/semantic_segmentation.md @@ -0,0 +1,598 @@ + + +# Semantic segmentation + +[[open-in-colab]] + + + +Semantic segmentation assigns a label or class to each individual pixel of an image. There are several types of segmentation, and in the case of semantic segmentation, no distinction is made between unique instances of the same object. Both objects are given the same label (for example, "car" instead of "car-1" and "car-2"). Common real-world applications of semantic segmentation include training self-driving cars to identify pedestrians and important traffic information, identifying cells and abnormalities in medical imagery, and monitoring environmental changes from satellite imagery. + +This guide will show you how to: + +1. Finetune [SegFormer](https://huggingface.co/docs/transformers/main/en/model_doc/segformer#segformer) on the [SceneParse150](https://huggingface.co/datasets/scene_parse_150) dataset. +2. Use your finetuned model for inference. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[BEiT](../model_doc/beit), [Data2VecVision](../model_doc/data2vec-vision), [DPT](../model_doc/dpt), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [SegFormer](../model_doc/segformer), [UPerNet](../model_doc/upernet) + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install -q datasets transformers evaluate +``` + +We encourage you to log in to your Hugging Face account so you can upload and share your model with the community. 
When prompted, enter your token to log in: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load SceneParse150 dataset + +Start by loading a smaller subset of the SceneParse150 dataset from the 🤗 Datasets library. This'll give you a chance to experiment and make sure everything works before spending more time training on the full dataset. + +```py +>>> from datasets import load_dataset + +>>> ds = load_dataset("scene_parse_150", split="train[:50]") +``` + +Split the dataset's `train` split into a train and test set with the [`~datasets.Dataset.train_test_split`] method: + +```py +>>> ds = ds.train_test_split(test_size=0.2) +>>> train_ds = ds["train"] +>>> test_ds = ds["test"] +``` + +Then take a look at an example: + +```py +>>> train_ds[0] +{'image': , + 'annotation': , + 'scene_category': 368} +``` + +- `image`: a PIL image of the scene. +- `annotation`: a PIL image of the segmentation map, which is also the model's target. +- `scene_category`: a category id that describes the image scene like "kitchen" or "office". In this guide, you'll only need `image` and `annotation`, both of which are PIL images. + +You'll also want to create a dictionary that maps a label id to a label class which will be useful when you set up the model later. Download the mappings from the Hub and create the `id2label` and `label2id` dictionaries: + +```py +>>> import json +>>> from huggingface_hub import cached_download, hf_hub_url + +>>> repo_id = "huggingface/label-files" +>>> filename = "ade20k-id2label.json" +>>> id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r")) +>>> id2label = {int(k): v for k, v in id2label.items()} +>>> label2id = {v: k for k, v in id2label.items()} +>>> num_labels = len(id2label) +``` + +## Preprocess + +The next step is to load a SegFormer image processor to prepare the images and annotations for the model. Some datasets, like this one, use the zero-index as the background class. However, the background class isn't actually included in the 150 classes, so you'll need to set `reduce_labels=True` to subtract one from all the labels. The zero-index is replaced by `255` so it's ignored by SegFormer's loss function: + +```py +>>> from transformers import AutoImageProcessor + +>>> checkpoint = "nvidia/mit-b0" +>>> image_processor = AutoImageProcessor.from_pretrained(checkpoint, reduce_labels=True) +``` + + + + +It is common to apply some data augmentations to an image dataset to make a model more robust against overfitting. In this guide, you'll use the [`ColorJitter`](https://pytorch.org/vision/stable/generated/torchvision.transforms.ColorJitter.html) function from [torchvision](https://pytorch.org/vision/stable/index.html) to randomly change the color properties of an image, but you can also use any image library you like. + +```py +>>> from torchvision.transforms import ColorJitter + +>>> jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1) +``` + +Now create two preprocessing functions to prepare the images and annotations for the model. These functions convert the images into `pixel_values` and annotations to `labels`. For the training set, `jitter` is applied before providing the images to the image processor. For the test set, the image processor crops and normalizes the `images`, and only crops the `labels` because no data augmentation is applied during testing. + +```py +>>> def train_transforms(example_batch): +... 
images = [jitter(x) for x in example_batch["image"]] +... labels = [x for x in example_batch["annotation"]] +... inputs = image_processor(images, labels) +... return inputs + + +>>> def val_transforms(example_batch): +... images = [x for x in example_batch["image"]] +... labels = [x for x in example_batch["annotation"]] +... inputs = image_processor(images, labels) +... return inputs +``` + +To apply the `jitter` over the entire dataset, use the 🤗 Datasets [`~datasets.Dataset.set_transform`] function. The transform is applied on the fly which is faster and consumes less disk space: + +```py +>>> train_ds.set_transform(train_transforms) +>>> test_ds.set_transform(val_transforms) +``` + + + + + + +It is common to apply some data augmentations to an image dataset to make a model more robust against overfitting. +In this guide, you'll use [`tf.image`](https://www.tensorflow.org/api_docs/python/tf/image) to randomly change the color properties of an image, but you can also use any image +library you like. +Define two separate transformation functions: +- training data transformations that include image augmentation +- validation data transformations that only transpose the images, since computer vision models in 🤗 Transformers expect channels-first layout + +```py +>>> import tensorflow as tf + + +>>> def aug_transforms(image): +... image = tf.keras.utils.img_to_array(image) +... image = tf.image.random_brightness(image, 0.25) +... image = tf.image.random_contrast(image, 0.5, 2.0) +... image = tf.image.random_saturation(image, 0.75, 1.25) +... image = tf.image.random_hue(image, 0.1) +... image = tf.transpose(image, (2, 0, 1)) +... return image + + +>>> def transforms(image): +... image = tf.keras.utils.img_to_array(image) +... image = tf.transpose(image, (2, 0, 1)) +... return image +``` + +Next, create two preprocessing functions to prepare batches of images and annotations for the model. These functions apply +the image transformations and use the earlier loaded `image_processor` to convert the images into `pixel_values` and +annotations to `labels`. `ImageProcessor` also takes care of resizing and normalizing the images. + +```py +>>> def train_transforms(example_batch): +... images = [aug_transforms(x.convert("RGB")) for x in example_batch["image"]] +... labels = [x for x in example_batch["annotation"]] +... inputs = image_processor(images, labels) +... return inputs + + +>>> def val_transforms(example_batch): +... images = [transforms(x.convert("RGB")) for x in example_batch["image"]] +... labels = [x for x in example_batch["annotation"]] +... inputs = image_processor(images, labels) +... return inputs +``` + +To apply the preprocessing transformations over the entire dataset, use the 🤗 Datasets [`~datasets.Dataset.set_transform`] function. +The transform is applied on the fly which is faster and consumes less disk space: + +```py +>>> train_ds.set_transform(train_transforms) +>>> test_ds.set_transform(val_transforms) +``` + + + +## Evaluate + +Including a metric during training is often helpful for evaluating your model's performance. You can quickly load an evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. 
For this task, load the [mean Intersection over Union](https://huggingface.co/spaces/evaluate-metric/accuracy) (IoU) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): + +```py +>>> import evaluate + +>>> metric = evaluate.load("mean_iou") +``` + +Then create a function to [`~evaluate.EvaluationModule.compute`] the metrics. Your predictions need to be converted to +logits first, and then reshaped to match the size of the labels before you can call [`~evaluate.EvaluationModule.compute`]: + + + + +```py +>>> import numpy as np +>>> import torch +>>> from torch import nn + +>>> def compute_metrics(eval_pred): +... with torch.no_grad(): +... logits, labels = eval_pred +... logits_tensor = torch.from_numpy(logits) +... logits_tensor = nn.functional.interpolate( +... logits_tensor, +... size=labels.shape[-2:], +... mode="bilinear", +... align_corners=False, +... ).argmax(dim=1) + +... pred_labels = logits_tensor.detach().cpu().numpy() +... metrics = metric.compute( +... predictions=pred_labels, +... references=labels, +... num_labels=num_labels, +... ignore_index=255, +... reduce_labels=False, +... ) +... for key, value in metrics.items(): +... if type(value) is np.ndarray: +... metrics[key] = value.tolist() +... return metrics +``` + + + + + + + + +```py +>>> def compute_metrics(eval_pred): +... logits, labels = eval_pred +... logits = tf.transpose(logits, perm=[0, 2, 3, 1]) +... logits_resized = tf.image.resize( +... logits, +... size=tf.shape(labels)[1:], +... method="bilinear", +... ) + +... pred_labels = tf.argmax(logits_resized, axis=-1) +... metrics = metric.compute( +... predictions=pred_labels, +... references=labels, +... num_labels=num_labels, +... ignore_index=-1, +... reduce_labels=image_processor.do_reduce_labels, +... ) + +... per_category_accuracy = metrics.pop("per_category_accuracy").tolist() +... per_category_iou = metrics.pop("per_category_iou").tolist() + +... metrics.update({f"accuracy_{id2label[i]}": v for i, v in enumerate(per_category_accuracy)}) +... metrics.update({f"iou_{id2label[i]}": v for i, v in enumerate(per_category_iou)}) +... return {"val_" + k: v for k, v in metrics.items()} +``` + + + + +Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. + +## Train + + + + +If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#finetune-with-trainer)! + + + +You're ready to start training your model now! Load SegFormer with [`AutoModelForSemanticSegmentation`], and pass the model the mapping between label ids and label classes: + +```py +>>> from transformers import AutoModelForSemanticSegmentation, TrainingArguments, Trainer + +>>> model = AutoModelForSemanticSegmentation.from_pretrained(checkpoint, id2label=id2label, label2id=label2id) +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`TrainingArguments`]. It is important you don't remove unused columns because this'll drop the `image` column. Without the `image` column, you can't create `pixel_values`. Set `remove_unused_columns=False` to prevent this behavior! The only other required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). 
At the end of each epoch, the [`Trainer`] will evaluate the IoU metric and save the training checkpoint. +2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> training_args = TrainingArguments( +... output_dir="segformer-b0-scene-parse-150", +... learning_rate=6e-5, +... num_train_epochs=50, +... per_device_train_batch_size=2, +... per_device_eval_batch_size=2, +... save_total_limit=3, +... evaluation_strategy="steps", +... save_strategy="steps", +... save_steps=20, +... eval_steps=20, +... logging_steps=1, +... eval_accumulation_steps=5, +... remove_unused_columns=False, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=train_ds, +... eval_dataset=test_ds, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + + + + +If you are unfamiliar with fine-tuning a model with Keras, check out the [basic tutorial](./training#train-a-tensorflow-model-with-keras) first! + + + +To fine-tune a model in TensorFlow, follow these steps: +1. Define the training hyperparameters, and set up an optimizer and a learning rate schedule. +2. Instantiate a pretrained model. +3. Convert a 🤗 Dataset to a `tf.data.Dataset`. +4. Compile your model. +5. Add callbacks to calculate metrics and upload your model to 🤗 Hub +6. Use the `fit()` method to run the training. + +Start by defining the hyperparameters, optimizer and learning rate schedule: + +```py +>>> from transformers import create_optimizer + +>>> batch_size = 2 +>>> num_epochs = 50 +>>> num_train_steps = len(train_ds) * num_epochs +>>> learning_rate = 6e-5 +>>> weight_decay_rate = 0.01 + +>>> optimizer, lr_schedule = create_optimizer( +... init_lr=learning_rate, +... num_train_steps=num_train_steps, +... weight_decay_rate=weight_decay_rate, +... num_warmup_steps=0, +... ) +``` + +Then, load SegFormer with [`TFAutoModelForSemanticSegmentation`] along with the label mappings, and compile it with the +optimizer. Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +```py +>>> from transformers import TFAutoModelForSemanticSegmentation + +>>> model = TFAutoModelForSemanticSegmentation.from_pretrained( +... checkpoint, +... id2label=id2label, +... label2id=label2id, +... ) +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +Convert your datasets to the `tf.data.Dataset` format using the [`~datasets.Dataset.to_tf_dataset`] and the [`DefaultDataCollator`]: + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator(return_tensors="tf") + +>>> tf_train_dataset = train_ds.to_tf_dataset( +... columns=["pixel_values", "label"], +... shuffle=True, +... batch_size=batch_size, +... collate_fn=data_collator, +... ) + +>>> tf_eval_dataset = test_ds.to_tf_dataset( +... columns=["pixel_values", "label"], +... shuffle=True, +... batch_size=batch_size, +... collate_fn=data_collator, +... ) +``` + +To compute the accuracy from the predictions and push your model to the 🤗 Hub, use [Keras callbacks](../main_classes/keras_callbacks). 
+Pass your `compute_metrics` function to [`KerasMetricCallback`], +and use the [`PushToHubCallback`] to upload the model: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback + +>>> metric_callback = KerasMetricCallback( +... metric_fn=compute_metrics, eval_dataset=tf_eval_dataset, batch_size=batch_size, label_cols=["labels"] +... ) + +>>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", tokenizer=image_processor) + +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +Finally, you are ready to train your model! Call `fit()` with your training and validation datasets, the number of epochs, +and your callbacks to fine-tune the model: + +```py +>>> model.fit( +... tf_train_dataset, +... validation_data=tf_eval_dataset, +... callbacks=callbacks, +... epochs=num_epochs, +... ) +``` + +Congratulations! You have fine-tuned your model and shared it on the 🤗 Hub. You can now use it for inference! + + + + +## Inference + +Great, now that you've finetuned a model, you can use it for inference! + +Load an image for inference: + +```py +>>> image = ds[0]["image"] +>>> image +``` + +
+ Image of bedroom +
+ + + +The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for image segmentation with your model, and pass your image to it: + +```py +>>> from transformers import pipeline + +>>> segmenter = pipeline("image-segmentation", model="my_awesome_seg_model") +>>> segmenter(image) +[{'score': None, + 'label': 'wall', + 'mask': }, + {'score': None, + 'label': 'sky', + 'mask': }, + {'score': None, + 'label': 'floor', + 'mask': }, + {'score': None, + 'label': 'ceiling', + 'mask': }, + {'score': None, + 'label': 'bed ', + 'mask': }, + {'score': None, + 'label': 'windowpane', + 'mask': }, + {'score': None, + 'label': 'cabinet', + 'mask': }, + {'score': None, + 'label': 'chair', + 'mask': }, + {'score': None, + 'label': 'armchair', + 'mask': }] +``` + +You can also manually replicate the results of the `pipeline` if you'd like. Process the image with an image processor and place the `pixel_values` on a GPU: + +```py +>>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # use GPU if available, otherwise use a CPU +>>> encoding = image_processor(image, return_tensors="pt") +>>> pixel_values = encoding.pixel_values.to(device) +``` + +Pass your input to the model and return the `logits`: + +```py +>>> outputs = model(pixel_values=pixel_values) +>>> logits = outputs.logits.cpu() +``` + +Next, rescale the logits to the original image size: + +```py +>>> upsampled_logits = nn.functional.interpolate( +... logits, +... size=image.size[::-1], +... mode="bilinear", +... align_corners=False, +... ) + +>>> pred_seg = upsampled_logits.argmax(dim=1)[0] +``` + + + + + + +Load an image processor to preprocess the image and return the input as TensorFlow tensors: + +```py +>>> from transformers import AutoImageProcessor + +>>> image_processor = AutoImageProcessor.from_pretrained("MariaK/scene_segmentation") +>>> inputs = image_processor(image, return_tensors="tf") +``` + +Pass your input to the model and return the `logits`: + +```py +>>> from transformers import TFAutoModelForSemanticSegmentation + +>>> model = TFAutoModelForSemanticSegmentation.from_pretrained("MariaK/scene_segmentation") +>>> logits = model(**inputs).logits +``` + +Next, rescale the logits to the original image size and apply argmax on the class dimension: +```py +>>> logits = tf.transpose(logits, [0, 2, 3, 1]) + +>>> upsampled_logits = tf.image.resize( +... logits, +... # We reverse the shape of `image` because `image.size` returns width and height. +... image.size[::-1], +... ) + +>>> pred_seg = tf.math.argmax(upsampled_logits, axis=-1)[0] +``` + + + + +To visualize the results, load the [dataset color palette](https://github.com/tensorflow/models/blob/3f1ca33afe3c1631b733ea7e40c294273b9e406d/research/deeplab/utils/get_dataset_colormap.py#L51) as `ade_palette()` that maps each class to their RGB values. Then you can combine and plot your image and the predicted segmentation map: + +```py +>>> import matplotlib.pyplot as plt +>>> import numpy as np + +>>> color_seg = np.zeros((pred_seg.shape[0], pred_seg.shape[1], 3), dtype=np.uint8) +>>> palette = np.array(ade_palette()) +>>> for label, color in enumerate(palette): +... color_seg[pred_seg == label, :] = color +>>> color_seg = color_seg[..., ::-1] # convert to BGR + +>>> img = np.array(image) * 0.5 + color_seg * 0.5 # plot the image with the segmentation map +>>> img = img.astype(np.uint8) + +>>> plt.figure(figsize=(15, 10)) +>>> plt.imshow(img) +>>> plt.show() +``` + +
+ Image of bedroom overlaid with segmentation map +
diff --git a/docs/source/en/tasks/semantic_segmentation.mdx b/docs/source/en/tasks/semantic_segmentation.mdx deleted file mode 100644 index f1ab7ee0ea68..000000000000 --- a/docs/source/en/tasks/semantic_segmentation.mdx +++ /dev/null @@ -1,333 +0,0 @@ - - -# Semantic segmentation - -[[open-in-colab]] - - - -Semantic segmentation assigns a label or class to each individual pixel of an image. There are several types of segmentation, and in the case of semantic segmentation, no distinction is made between unique instances of the same object. Both objects are given the same label (for example, "car" instead of "car-1" and "car-2"). Common real-world applications of semantic segmentation include training self-driving cars to identify pedestrians and important traffic information, identifying cells and abnormalities in medical imagery, and monitoring environmental changes from satellite imagery. - -This guide will show you how to: - -1. Finetune [SegFormer](https://huggingface.co/docs/transformers/main/en/model_doc/segformer#segformer) on the [SceneParse150](https://huggingface.co/datasets/scene_parse_150) dataset. -2. Use your finetuned model for inference. - - - -See the image segmentation [task page](https://huggingface.co/tasks/image-segmentation) for more information about its associated models, datasets, and metrics. - - - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install -q datasets transformers evaluate -``` - -We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load SceneParse150 dataset - -Start by loading a smaller subset of the SceneParse150 dataset from the 🤗 Datasets library. This'll give you a chance to experiment and make sure everythings works before spending more time training on the full dataset. - -```py ->>> from datasets import load_dataset - ->>> ds = load_dataset("scene_parse_150", split="train[:50]") -``` - -Split the dataset's `train` split into a train and test set with the [`~datasets.Dataset.train_test_split`] method: - -```py ->>> ds = ds.train_test_split(test_size=0.2) ->>> train_ds = ds["train"] ->>> test_ds = ds["test"] -``` - -Then take a look at an example: - -```py ->>> train_ds[0] -{'image': , - 'annotation': , - 'scene_category': 368} -``` - -- `image`: a PIL image of the scene. -- `annotation`: a PIL image of the segmentation map, which is also the model's target. -- `scene_category`: a category id that describes the image scene like "kitchen" or "office". In this guide, you'll only need `image` and `annotation`, both of which are PIL images. - -You'll also want to create a dictionary that maps a label id to a label class which will be useful when you set up the model later. Download the mappings from the Hub and create the `id2label` and `label2id` dictionaries: - -```py ->>> import json ->>> from huggingface_hub import cached_download, hf_hub_url - ->>> repo_id = "huggingface/label-files" ->>> filename = "ade20k-id2label.json" ->>> id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r")) ->>> id2label = {int(k): v for k, v in id2label.items()} ->>> label2id = {v: k for k, v in id2label.items()} ->>> num_labels = len(id2label) -``` - -## Preprocess - -The next step is to load a SegFormer image processor to prepare the images and annotations for the model. 
Some datasets, like this one, use the zero-index as the background class. However, the background class isn't actually included in the 150 classes, so you'll need to set `reduce_labels=True` to subtract one from all the labels. The zero-index is replaced by `255` so it's ignored by SegFormer's loss function: - -```py ->>> from transformers import AutoImageProcessor - ->>> feature_extractor = AutoImageProcessor.from_pretrained("nvidia/mit-b0", reduce_labels=True) -``` - -It is common to apply some data augmentations to an image dataset to make a model more robust against overfitting. In this guide, you'll use the [`ColorJitter`](https://pytorch.org/vision/stable/generated/torchvision.transforms.ColorJitter.html) function from [torchvision](https://pytorch.org/vision/stable/index.html) to randomly change the color properties of an image, but you can also use any image library you like. - -```py ->>> from torchvision.transforms import ColorJitter - ->>> jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1) -``` - -Now create two preprocessing functions to prepare the images and annotations for the model. These functions convert the images into `pixel_values` and annotations to `labels`. For the training set, `jitter` is applied before providing the images to the image processor. For the test set, the image processor crops and normalizes the `images`, and only crops the `labels` because no data augmentation is applied during testing. - -```py ->>> def train_transforms(example_batch): -... images = [jitter(x) for x in example_batch["image"]] -... labels = [x for x in example_batch["annotation"]] -... inputs = feature_extractor(images, labels) -... return inputs - - ->>> def val_transforms(example_batch): -... images = [x for x in example_batch["image"]] -... labels = [x for x in example_batch["annotation"]] -... inputs = feature_extractor(images, labels) -... return inputs -``` - -To apply the `jitter` over the entire dataset, use the 🤗 Datasets [`~datasets.Dataset.set_transform`] function. The transform is applied on the fly which is faster and consumes less disk space: - -```py ->>> train_ds.set_transform(train_transforms) ->>> test_ds.set_transform(val_transforms) -``` - -## Evaluate - -Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [mean Intersection over Union](https://huggingface.co/spaces/evaluate-metric/accuracy) (IoU) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): - -```py ->>> import evaluate - ->>> metric = evaluate.load("mean_iou") -``` - -Then create a function to [`~evaluate.EvaluationModule.compute`] the metrics. Your predictions need to be converted to logits first, and then reshaped to match the size of the labels before you can call [`~evaluate.EvaluationModule.compute`]: - -```py ->>> def compute_metrics(eval_pred): -... with torch.no_grad(): -... logits, labels = eval_pred -... logits_tensor = torch.from_numpy(logits) -... logits_tensor = nn.functional.interpolate( -... logits_tensor, -... size=labels.shape[-2:], -... mode="bilinear", -... align_corners=False, -... ).argmax(dim=1) - -... pred_labels = logits_tensor.detach().cpu().numpy() -... metrics = metric.compute( -... predictions=pred_labels, -... references=labels, -... num_labels=num_labels, -... 
ignore_index=255, -... reduce_labels=False, -... ) -... for key, value in metrics.items(): -... if type(value) is np.ndarray: -... metrics[key] = value.tolist() -... return metrics -``` - -Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. - -## Train - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#finetune-with-trainer)! - - - -You're ready to start training your model now! Load SegFormer with [`AutoModelForSemanticSegmentation`], and pass the model the mapping between label ids and label classes: - -```py ->>> from transformers import AutoModelForSemanticSegmentation, TrainingArguments, Trainer - ->>> pretrained_model_name = "nvidia/mit-b0" ->>> model = AutoModelForSemanticSegmentation.from_pretrained( -... pretrained_model_name, id2label=id2label, label2id=label2id -... ) -``` - -At this point, only three steps remain: - -1. Define your training hyperparameters in [`TrainingArguments`]. It is important you don't remove unused columns because this'll drop the `image` column. Without the `image` column, you can't create `pixel_values`. Set `remove_unused_columns=False` to prevent this behavior! The only other required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the IoU metric and save the training checkpoint. -2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. -3. Call [`~Trainer.train`] to finetune your model. - -```py ->>> training_args = TrainingArguments( -... output_dir="segformer-b0-scene-parse-150", -... learning_rate=6e-5, -... num_train_epochs=50, -... per_device_train_batch_size=2, -... per_device_eval_batch_size=2, -... save_total_limit=3, -... evaluation_strategy="steps", -... save_strategy="steps", -... save_steps=20, -... eval_steps=20, -... logging_steps=1, -... eval_accumulation_steps=5, -... remove_unused_columns=False, -... push_to_hub=True, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=train_ds, -... eval_dataset=test_ds, -... compute_metrics=compute_metrics, -... ) - ->>> trainer.train() -``` - -Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - -## Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Load an image for inference: - -```py ->>> image = ds[0]["image"] ->>> image -``` - -
- Image of bedroom -
- -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for image segmentation with your model, and pass your image to it: - -```py ->>> from transformers import pipeline - ->>> segmenter = pipeline("image-segmentation", model="my_awesome_seg_model") ->>> segmenter(image) -[{'score': None, - 'label': 'wall', - 'mask': }, - {'score': None, - 'label': 'sky', - 'mask': }, - {'score': None, - 'label': 'floor', - 'mask': }, - {'score': None, - 'label': 'ceiling', - 'mask': }, - {'score': None, - 'label': 'bed ', - 'mask': }, - {'score': None, - 'label': 'windowpane', - 'mask': }, - {'score': None, - 'label': 'cabinet', - 'mask': }, - {'score': None, - 'label': 'chair', - 'mask': }, - {'score': None, - 'label': 'armchair', - 'mask': }] -``` - -You can also manually replicate the results of the `pipeline` if you'd like. Process the image with an image processor and place the `pixel_values` on a GPU: - -```py ->>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # use GPU if available, otherwise use a CPU ->>> encoding = feature_extractor(image, return_tensors="pt") ->>> pixel_values = encoding.pixel_values.to(device) -``` - -Pass your input to the model and return the `logits`: - -```py ->>> outputs = model(pixel_values=pixel_values) ->>> logits = outputs.logits.cpu() -``` - -Next, rescale the logits to the original image size: - -```py ->>> upsampled_logits = nn.functional.interpolate( -... logits, -... size=image.size[::-1], -... mode="bilinear", -... align_corners=False, -... ) - ->>> pred_seg = upsampled_logits.argmax(dim=1)[0] -``` - -To visualize the results, load the [dataset color palette](https://github.com/tensorflow/models/blob/3f1ca33afe3c1631b733ea7e40c294273b9e406d/research/deeplab/utils/get_dataset_colormap.py#L51) that maps each class to their RGB values. Then you can combine and plot your image and the predicted segmentation map: - -```py ->>> import matplotlib.pyplot as plt - ->>> color_seg = np.zeros((pred_seg.shape[0], pred_seg.shape[1], 3), dtype=np.uint8) ->>> palette = np.array(ade_palette()) ->>> for label, color in enumerate(palette): -... color_seg[pred_seg == label, :] = color ->>> color_seg = color_seg[..., ::-1] # convert to BGR - ->>> img = np.array(image) * 0.5 + color_seg * 0.5 # plot the image with the segmentation map ->>> img = img.astype(np.uint8) - ->>> plt.figure(figsize=(15, 10)) ->>> plt.imshow(img) ->>> plt.show() -``` - -
- Image of bedroom overlaid with segmentation map -
\ No newline at end of file diff --git a/docs/source/en/tasks/sequence_classification.md b/docs/source/en/tasks/sequence_classification.md new file mode 100644 index 000000000000..b67d43453d27 --- /dev/null +++ b/docs/source/en/tasks/sequence_classification.md @@ -0,0 +1,399 @@ + + +# Text classification + +[[open-in-colab]] + + + +Text classification is a common NLP task that assigns a label or class to text. Some of the largest companies run text classification in production for a wide range of practical applications. One of the most popular forms of text classification is sentiment analysis, which assigns a label like 🙂 positive, 🙁 negative, or 😐 neutral to a sequence of text. + +This guide will show you how to: + +1. Finetune [DistilBERT](https://huggingface.co/distilbert-base-uncased) on the [IMDb](https://huggingface.co/datasets/imdb) dataset to determine whether a movie review is positive or negative. +2. Use your finetuned model for inference. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + + +[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [CodeLlama](../model_doc/code_llama), [ConvBERT](../model_doc/convbert), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [ESM](../model_doc/esm), [Falcon](../model_doc/falcon), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [LLaMA](../model_doc/llama), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MPT](../model_doc/mpt), [MRA](../model_doc/mra), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Perceiver](../model_doc/perceiver), [Persimmon](../model_doc/persimmon), [PLBart](../model_doc/plbart), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [T5](../model_doc/t5), [TAPAS](../model_doc/tapas), [Transformer-XL](../model_doc/transfo-xl), [UMT5](../model_doc/umt5), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), 
[X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso) + + + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate +``` + +We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load IMDb dataset + +Start by loading the IMDb dataset from the 🤗 Datasets library: + +```py +>>> from datasets import load_dataset + +>>> imdb = load_dataset("imdb") +``` + +Then take a look at an example: + +```py +>>> imdb["test"][0] +{ + "label": 0, + "text": "I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to always say \"Gene Roddenberry's Earth...\" otherwise people would not continue watching. Roddenberry's ashes must be turning in their orbit as this dull, cheap, poorly edited (watching it without advert breaks really brings this home) trudging Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring him back as another actor. Jeeez! Dallas all over again.", +} +``` + +There are two fields in this dataset: + +- `text`: the movie review text. +- `label`: a value that is either `0` for a negative review or `1` for a positive review. + +## Preprocess + +The next step is to load a DistilBERT tokenizer to preprocess the `text` field: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") +``` + +Create a preprocessing function to tokenize `text` and truncate sequences to be no longer than DistilBERT's maximum input length: + +```py +>>> def preprocess_function(examples): +... return tokenizer(examples["text"], truncation=True) +``` + +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once: + +```py +tokenized_imdb = imdb.map(preprocess_function, batched=True) +``` + +Now create a batch of examples using [`DataCollatorWithPadding`]. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. 
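+To get a feel for what dynamic padding means, you can tokenize two reviews of different lengths and pad them only to the longer of the two. This is just an illustrative sketch (the two reviews below are made up, and the exact sequence lengths depend on the tokenizer), but it shows that padding is applied per batch rather than across the whole dataset:
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+
+>>> reviews = [
+...     "A short review.",  # made-up example reviews of different lengths
+...     "A much longer review that produces more tokens and therefore sets the padded length for this small batch.",
+... ]
+>>> batch = tokenizer(reviews, padding="longest")  # pad only to the longest sequence in this batch
+>>> [len(ids) for ids in batch["input_ids"]]  # both entries now share the length of the longer review
+```
+
+[`DataCollatorWithPadding`] applies this same "pad to the longest sequence in the batch" behavior on the fly to every training batch: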
+ + + +```py +>>> from transformers import DataCollatorWithPadding + +>>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer) +``` + + +```py +>>> from transformers import DataCollatorWithPadding + +>>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf") +``` + + + +## Evaluate + +Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): + +```py +>>> import evaluate + +>>> accuracy = evaluate.load("accuracy") +``` + +Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the accuracy: + +```py +>>> import numpy as np + + +>>> def compute_metrics(eval_pred): +... predictions, labels = eval_pred +... predictions = np.argmax(predictions, axis=1) +... return accuracy.compute(predictions=predictions, references=labels) +``` + +Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. + +## Train + +Before you start training your model, create a map of the expected ids to their labels with `id2label` and `label2id`: + +```py +>>> id2label = {0: "NEGATIVE", 1: "POSITIVE"} +>>> label2id = {"NEGATIVE": 0, "POSITIVE": 1} +``` + + + + + +If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! + + + +You're ready to start training your model now! Load DistilBERT with [`AutoModelForSequenceClassification`] along with the number of expected labels, and the label mappings: + +```py +>>> from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer + +>>> model = AutoModelForSequenceClassification.from_pretrained( +... "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id +... ) +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the accuracy and save the training checkpoint. +2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_model", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... num_train_epochs=2, +... weight_decay=0.01, +... evaluation_strategy="epoch", +... save_strategy="epoch", +... load_best_model_at_end=True, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_imdb["train"], +... eval_dataset=tokenized_imdb["test"], +... tokenizer=tokenizer, +... data_collator=data_collator, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + + + +[`Trainer`] applies dynamic padding by default when you pass `tokenizer` to it. 
In this case, you don't need to specify a data collator explicitly. + + + +Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + +If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! + + +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +```py +>>> from transformers import create_optimizer +>>> import tensorflow as tf + +>>> batch_size = 16 +>>> num_epochs = 5 +>>> batches_per_epoch = len(tokenized_imdb["train"]) // batch_size +>>> total_train_steps = int(batches_per_epoch * num_epochs) +>>> optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps) +``` + +Then you can load DistilBERT with [`TFAutoModelForSequenceClassification`] along with the number of expected labels, and the label mappings: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained( +... "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id +... ) +``` + +Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_imdb["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_validation_set = model.prepare_tf_dataset( +... tokenized_imdb["test"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +The last two things to setup before you start training is to compute the accuracy from the predictions, and provide a way to push your model to the Hub. Both are done by using [Keras callbacks](../main_classes/keras_callbacks). + +Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) +``` + +Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="my_awesome_model", +... tokenizer=tokenizer, +... ) +``` + +Then bundle your callbacks together: + +```py +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callbacks to finetune the model: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks) +``` + +Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! 
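+As a quick sanity check before relying on the uploaded checkpoint, you can also run the freshly trained model (still in memory as `model`, together with the `tokenizer` loaded earlier) on a single review. The example text below is only illustrative, and the predicted label depends on how your training run went:
+
+```py
+>>> import tensorflow as tf
+
+>>> sample = "An unexpectedly moving film with a terrific cast."  # made-up example review
+>>> inputs = tokenizer(sample, return_tensors="tf")
+>>> logits = model(**inputs).logits
+>>> model.config.id2label[int(tf.math.argmax(logits, axis=-1)[0])]
+```
+
+A more complete inference workflow, including the [`pipeline`], is shown below.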
+ + + + + +For a more in-depth example of how to finetune a model for text classification, take a look at the corresponding +[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb) +or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb). + + + +## Inference + +Great, now that you've finetuned a model, you can use it for inference! + +Grab some text you'd like to run inference on: + +```py +>>> text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three." +``` + +The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for sentiment analysis with your model, and pass your text to it: + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline("sentiment-analysis", model="stevhliu/my_awesome_model") +>>> classifier(text) +[{'label': 'POSITIVE', 'score': 0.9994940757751465}] +``` + +You can also manually replicate the results of the `pipeline` if you'd like: + + + +Tokenize the text and return PyTorch tensors: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_model") +>>> inputs = tokenizer(text, return_tensors="pt") +``` + +Pass your inputs to the model and return the `logits`: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("stevhliu/my_awesome_model") +>>> with torch.no_grad(): +... logits = model(**inputs).logits +``` + +Get the class with the highest probability, and use the model's `id2label` mapping to convert it to a text label: + +```py +>>> predicted_class_id = logits.argmax().item() +>>> model.config.id2label[predicted_class_id] +'POSITIVE' +``` + + +Tokenize the text and return TensorFlow tensors: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_model") +>>> inputs = tokenizer(text, return_tensors="tf") +``` + +Pass your inputs to the model and return the `logits`: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained("stevhliu/my_awesome_model") +>>> logits = model(**inputs).logits +``` + +Get the class with the highest probability, and use the model's `id2label` mapping to convert it to a text label: + +```py +>>> predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0]) +>>> model.config.id2label[predicted_class_id] +'POSITIVE' +``` + + diff --git a/docs/source/en/tasks/sequence_classification.mdx b/docs/source/en/tasks/sequence_classification.mdx deleted file mode 100644 index bc9c5f20e722..000000000000 --- a/docs/source/en/tasks/sequence_classification.mdx +++ /dev/null @@ -1,386 +0,0 @@ - - -# Text classification - -[[open-in-colab]] - - - -Text classification is a common NLP task that assigns a label or class to text. Some of the largest companies run text classification in production for a wide range of practical applications. One of the most popular forms of text classification is sentiment analysis, which assigns a label like 🙂 positive, 🙁 negative, or 😐 neutral to a sequence of text. - -This guide will show you how to: - -1. 
Finetune [DistilBERT](https://huggingface.co/distilbert-base-uncased) on the [IMDb](https://huggingface.co/datasets/imdb) dataset to determine whether a movie review is positive or negative. -2. Use your finetuned model for inference. - - - -See the text classification [task page](https://huggingface.co/tasks/text-classification) for more information about other forms of text classification and their associated models, datasets, and metrics. - - - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install transformers datasets evaluate -``` - -We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load IMDb dataset - -Start by loading the IMDb dataset from the 🤗 Datasets library: - -```py ->>> from datasets import load_dataset - ->>> imdb = load_dataset("imdb") -``` - -Then take a look at an example: - -```py ->>> imdb["test"][0] -{ - "label": 0, - "text": "I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to always say \"Gene Roddenberry's Earth...\" otherwise people would not continue watching. Roddenberry's ashes must be turning in their orbit as this dull, cheap, poorly edited (watching it without advert breaks really brings this home) trudging Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring him back as another actor. Jeeez! Dallas all over again.", -} -``` - -There are two fields in this dataset: - -- `text`: the movie review text. -- `label`: a value that is either `0` for a negative review or `1` for a positive review. - -## Preprocess - -The next step is to load a DistilBERT tokenizer to preprocess the `text` field: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") -``` - -Create a preprocessing function to tokenize `text` and truncate sequences to be no longer than DistilBERT's maximum input length: - -```py ->>> def preprocess_function(examples): -... return tokenizer(examples["text"], truncation=True) -``` - -To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once: - -```py -tokenized_imdb = imdb.map(preprocess_function, batched=True) -``` - -Now create a batch of examples using [`DataCollatorWithPadding`]. 
It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximium length. - - - -```py ->>> from transformers import DataCollatorWithPadding - ->>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer) -``` - - -```py ->>> from transformers import DataCollatorWithPadding - ->>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf") -``` - - - -## Evaluate - -Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): - -```py ->>> import evaluate - ->>> accuracy = evaluate.load("accuracy") -``` - -Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the accuracy: - -```py ->>> import numpy as np - - ->>> def compute_metrics(eval_pred): -... predictions, labels = eval_pred -... predictions = np.argmax(predictions, axis=1) -... return accuracy.compute(predictions=predictions, references=labels) -``` - -Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. - -## Train - -Before you start training your model, create a map of the expected ids to their labels with `id2label` and `label2id`: - -```py ->>> id2label = {0: "NEGATIVE", 1: "POSITIVE"} ->>> label2id = {"NEGATIVE": 0, "POSITIVE": 1} -``` - - - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! - - -You're ready to start training your model now! Load DistilBERT with [`AutoModelForSequenceClassification`] along with the number of expected labels, and the label mappings: - -```py ->>> from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer - ->>> model = AutoModelForSequenceClassification.from_pretrained( -... "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id -... ) -``` - -At this point, only three steps remain: - -1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the accuracy and save the training checkpoint. -2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. -3. Call [`~Trainer.train`] to finetune your model. - -```py ->>> training_args = TrainingArguments( -... output_dir="my_awesome_model", -... learning_rate=2e-5, -... per_device_train_batch_size=16, -... per_device_eval_batch_size=16, -... num_train_epochs=2, -... weight_decay=0.01, -... evaluation_strategy="epoch", -... save_strategy="epoch", -... load_best_model_at_end=True, -... push_to_hub=True, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=tokenized_imdb["train"], -... eval_dataset=tokenized_imdb["test"], -... tokenizer=tokenizer, -... data_collator=data_collator, -... 
compute_metrics=compute_metrics, -... ) - ->>> trainer.train() -``` - - - -[`Trainer`] applies dynamic padding by default when you pass `tokenizer` to it. In this case, you don't need to specify a data collator explicitly. - - - -Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - - - - -If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! - - -To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: - -```py ->>> from transformers import create_optimizer ->>> import tensorflow as tf - ->>> batch_size = 16 ->>> num_epochs = 5 ->>> batches_per_epoch = len(tokenized_imdb["train"]) // batch_size ->>> total_train_steps = int(batches_per_epoch * num_epochs) ->>> optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps) -``` - -Then you can load DistilBERT with [`TFAutoModelForSequenceClassification`] along with the number of expected labels, and the label mappings: - -```py ->>> from transformers import TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained( -... "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id -... ) -``` - -Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: - -```py ->>> tf_train_set = model.prepare_tf_dataset( -... tokenized_imdb["train"], -... shuffle=True, -... batch_size=16, -... collate_fn=data_collator, -... ) - ->>> tf_validation_set = model.prepare_tf_dataset( -... tokenized_imdb["test"], -... shuffle=False, -... batch_size=16, -... collate_fn=data_collator, -... ) -``` - -Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> import tensorflow as tf - ->>> model.compile(optimizer=optimizer) -``` - -The last two things to setup before you start training is to compute the accuracy from the predictions, and provide a way to push your model to the Hub. Both are done by using [Keras callbacks](./main_classes/keras_callbacks). - -Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: - -```py ->>> from transformers.keras_callbacks import KerasMetricCallback - ->>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) -``` - -Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: - -```py ->>> from transformers.keras_callbacks import PushToHubCallback - ->>> push_to_hub_callback = PushToHubCallback( -... output_dir="my_awesome_model", -... tokenizer=tokenizer, -... ) -``` - -Then bundle your callbacks together: - -```py ->>> callbacks = [metric_callback, push_to_hub_callback] -``` - -Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callbacks to finetune the model: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks) -``` - -Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! 
- - - - - -For a more in-depth example of how to finetune a model for text classification, take a look at the corresponding -[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb) -or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb). - - - -## Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Grab some text you'd like to run inference on: - -```py ->>> text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three." -``` - -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for sentiment analysis with your model, and pass your text to it: - -```py ->>> from transformers import pipeline - ->>> classifier = pipeline("sentiment-analysis", model="stevhliu/my_awesome_model") ->>> classifier(text) -[{'label': 'POSITIVE', 'score': 0.9994940757751465}] -``` - -You can also manually replicate the results of the `pipeline` if you'd like: - - - -Tokenize the text and return PyTorch tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_model") ->>> inputs = tokenizer(text, return_tensors="pt") -``` - -Pass your inputs to the model and return the `logits`: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("stevhliu/my_awesome_model") ->>> with torch.no_grad(): -... logits = model(**inputs).logits -``` - -Get the class with the highest probability, and use the model's `id2label` mapping to convert it to a text label: - -```py ->>> predicted_class_id = logits.argmax().item() ->>> model.config.id2label[predicted_class_id] -'POSITIVE' -``` - - -Tokenize the text and return TensorFlow tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_model") ->>> inputs = tokenizer(text, return_tensors="tf") -``` - -Pass your inputs to the model and return the `logits`: - -```py ->>> from transformers import TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained("stevhliu/my_awesome_model") ->>> logits = model(**inputs).logits -``` - -Get the class with the highest probability, and use the model's `id2label` mapping to convert it to a text label: - -```py ->>> predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0]) ->>> model.config.id2label[predicted_class_id] -'POSITIVE' -``` - - \ No newline at end of file diff --git a/docs/source/en/tasks/summarization.md b/docs/source/en/tasks/summarization.md new file mode 100644 index 000000000000..ecdf37ce6efb --- /dev/null +++ b/docs/source/en/tasks/summarization.md @@ -0,0 +1,403 @@ + + +# Summarization + +[[open-in-colab]] + + + +Summarization creates a shorter version of a document or an article that captures all the important information. Along with translation, it is another example of a task that can be formulated as a sequence-to-sequence task. Summarization can be: + +- Extractive: extract the most relevant information from a document. +- Abstractive: generate new text that captures the most relevant information. + +This guide will show you how to: + +1. 
Finetune [T5](https://huggingface.co/t5-small) on the California state bill subset of the [BillSum](https://huggingface.co/datasets/billsum) dataset for abstractive summarization. +2. Use your finetuned model for inference. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM-ProphetNet](../model_doc/xlm-prophetnet) + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate rouge_score +``` + +We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load BillSum dataset + +Start by loading the smaller California state bill subset of the BillSum dataset from the 🤗 Datasets library: + +```py +>>> from datasets import load_dataset + +>>> billsum = load_dataset("billsum", split="ca_test") +``` + +Split the dataset into a train and test set with the [`~datasets.Dataset.train_test_split`] method: + +```py +>>> billsum = billsum.train_test_split(test_size=0.2) +``` + +Then take a look at an example: + +```py +>>> billsum["train"][0] +{'summary': 'Existing law authorizes state agencies to enter into contracts for the acquisition of goods or services upon approval by the Department of General Services. Existing law sets forth various requirements and prohibitions for those contracts, including, but not limited to, a prohibition on entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between spouses and domestic partners or same-sex and different-sex couples in the provision of benefits. Existing law provides that a contract entered into in violation of those requirements and prohibitions is void and authorizes the state or any person acting on behalf of the state to bring a civil action seeking a determination that a contract is in violation and therefore void. Under existing law, a willful violation of those requirements and prohibitions is a misdemeanor.\nThis bill would also prohibit a state agency from entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between employees on the basis of gender identity in the provision of benefits, as specified. By expanding the scope of a crime, this bill would impose a state-mandated local program.\nThe California Constitution requires the state to reimburse local agencies and school districts for certain costs mandated by the state. 
Statutory provisions establish procedures for making that reimbursement.\nThis bill would provide that no reimbursement is required by this act for a specified reason.', + 'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 10295.35 is added to the Public Contract Code, to read:\n10295.35.\n(a) (1) Notwithstanding any other law, a state agency shall not enter into any contract for the acquisition of goods or services in the amount of one hundred thousand dollars ($100,000) or more with a contractor that, in the provision of benefits, discriminates between employees on the basis of an employee’s or dependent’s actual or perceived gender identity, including, but not limited to, the employee’s or dependent’s identification as transgender.\n(2) For purposes of this section, “contract” includes contracts with a cumulative amount of one hundred thousand dollars ($100,000) or more per contractor in each fiscal year.\n(3) For purposes of this section, an employee health plan is discriminatory if the plan is not consistent with Section 1365.5 of the Health and Safety Code and Section 10140 of the Insurance Code.\n(4) The requirements of this section shall apply only to those portions of a contractor’s operations that occur under any of the following conditions:\n(A) Within the state.\n(B) On real property outside the state if the property is owned by the state or if the state has a right to occupy the property, and if the contractor’s presence at that location is connected to a contract with the state.\n(C) Elsewhere in the United States where work related to a state contract is being performed.\n(b) Contractors shall treat as confidential, to the maximum extent allowed by law or by the requirement of the contractor’s insurance provider, any request by an employee or applicant for employment benefits or any documentation of eligibility for benefits submitted by an employee or applicant for employment.\n(c) After taking all reasonable measures to find a contractor that complies with this section, as determined by the state agency, the requirements of this section may be waived under any of the following circumstances:\n(1) There is only one prospective contractor willing to enter into a specific contract with the state agency.\n(2) The contract is necessary to respond to an emergency, as determined by the state agency, that endangers the public health, welfare, or safety, or the contract is necessary for the provision of essential services, and no entity that complies with the requirements of this section capable of responding to the emergency is immediately available.\n(3) The requirements of this section violate, or are inconsistent with, the terms or conditions of a grant, subvention, or agreement, if the agency has made a good faith attempt to change the terms or conditions of any grant, subvention, or agreement to authorize application of this section.\n(4) The contractor is providing wholesale or bulk water, power, or natural gas, the conveyance or transmission of the same, or ancillary services, as required for ensuring reliable services in accordance with good utility practice, if the purchase of the same cannot practically be accomplished through the standard competitive bidding procedures and the contractor is not providing direct retail services to end users.\n(d) (1) A contractor shall not be deemed to discriminate in the provision of benefits if the contractor, in providing the benefits, pays the actual costs incurred in obtaining the 
benefit.\n(2) If a contractor is unable to provide a certain benefit, despite taking reasonable measures to do so, the contractor shall not be deemed to discriminate in the provision of benefits.\n(e) (1) Every contract subject to this chapter shall contain a statement by which the contractor certifies that the contractor is in compliance with this section.\n(2) The department or other contracting agency shall enforce this section pursuant to its existing enforcement powers.\n(3) (A) If a contractor falsely certifies that it is in compliance with this section, the contract with that contractor shall be subject to Article 9 (commencing with Section 10420), unless, within a time period specified by the department or other contracting agency, the contractor provides to the department or agency proof that it has complied, or is in the process of complying, with this section.\n(B) The application of the remedies or penalties contained in Article 9 (commencing with Section 10420) to a contract subject to this chapter shall not preclude the application of any existing remedies otherwise available to the department or other contracting agency under its existing enforcement powers.\n(f) Nothing in this section is intended to regulate the contracting practices of any local jurisdiction.\n(g) This section shall be construed so as not to conflict with applicable federal laws, rules, or regulations. In the event that a court or agency of competent jurisdiction holds that federal law, rule, or regulation invalidates any clause, sentence, paragraph, or section of this code or the application thereof to any person or circumstances, it is the intent of the state that the court or agency sever that clause, sentence, paragraph, or section so that the remainder of this section shall remain in effect.\nSEC. 2.\nSection 10295.35 of the Public Contract Code shall not be construed to create any new enforcement authority or responsibility in the Department of General Services or any other contracting agency.\nSEC. 3.\nNo reimbursement is required by this act pursuant to Section 6 of Article XIII\u2009B of the California Constitution because the only costs that may be incurred by a local agency or school district will be incurred because this act creates a new crime or infraction, eliminates a crime or infraction, or changes the penalty for a crime or infraction, within the meaning of Section 17556 of the Government Code, or changes the definition of a crime within the meaning of Section 6 of Article XIII\u2009B of the California Constitution.', + 'title': 'An act to add Section 10295.35 to the Public Contract Code, relating to public contracts.'} +``` + +There are two fields that you'll want to use: + +- `text`: the text of the bill which'll be the input to the model. +- `summary`: a condensed version of `text` which'll be the model target. + +## Preprocess + +The next step is to load a T5 tokenizer to process `text` and `summary`: + +```py +>>> from transformers import AutoTokenizer + +>>> checkpoint = "t5-small" +>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) +``` + +The preprocessing function you want to create needs to: + +1. Prefix the input with a prompt so T5 knows this is a summarization task. Some models capable of multiple NLP tasks require prompting for specific tasks. +2. Use the keyword `text_target` argument when tokenizing labels. +3. Truncate sequences to be no longer than the maximum length set by the `max_length` parameter. 
+ +```py +>>> prefix = "summarize: " + + +>>> def preprocess_function(examples): +... inputs = [prefix + doc for doc in examples["text"]] +... model_inputs = tokenizer(inputs, max_length=1024, truncation=True) + +... labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True) + +... model_inputs["labels"] = labels["input_ids"] +... return model_inputs +``` + +To apply the preprocessing function over the entire dataset, use the 🤗 Datasets [`~datasets.Dataset.map`] method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once: + +```py +>>> tokenized_billsum = billsum.map(preprocess_function, batched=True) +``` + +Now create a batch of examples using [`DataCollatorForSeq2Seq`]. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. + + + +```py +>>> from transformers import DataCollatorForSeq2Seq + +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint) +``` + + +```py +>>> from transformers import DataCollatorForSeq2Seq + +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf") +``` + + + +## Evaluate + +Including a metric during training is often helpful for evaluating your model's performance. You can quickly load an evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [ROUGE](https://huggingface.co/spaces/evaluate-metric/rouge) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): + +```py +>>> import evaluate + +>>> rouge = evaluate.load("rouge") +``` + +Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the ROUGE metric: + +```py +>>> import numpy as np + + +>>> def compute_metrics(eval_pred): +... predictions, labels = eval_pred +... decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True) +... labels = np.where(labels != -100, labels, tokenizer.pad_token_id) +... decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + +... result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True) + +... prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions] +... result["gen_len"] = np.mean(prediction_lens) + +... return {k: round(v, 4) for k, v in result.items()} +``` + +Your `compute_metrics` function is ready to go now, and you'll return to it when you set up your training. + +## Train + + + + + +If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! + + + +You're ready to start training your model now! Load T5 with [`AutoModelForSeq2SeqLM`]: + +```py +>>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer + +>>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint) +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`Seq2SeqTrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). 
At the end of each epoch, the [`Trainer`] will evaluate the ROUGE metric and save the training checkpoint. +2. Pass the training arguments to [`Seq2SeqTrainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> training_args = Seq2SeqTrainingArguments( +... output_dir="my_awesome_billsum_model", +... evaluation_strategy="epoch", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... weight_decay=0.01, +... save_total_limit=3, +... num_train_epochs=4, +... predict_with_generate=True, +... fp16=True, +... push_to_hub=True, +... ) + +>>> trainer = Seq2SeqTrainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_billsum["train"], +... eval_dataset=tokenized_billsum["test"], +... tokenizer=tokenizer, +... data_collator=data_collator, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + +If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! + + +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +```py +>>> from transformers import create_optimizer, AdamWeightDecay + +>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) +``` + +Then you can load T5 with [`TFAutoModelForSeq2SeqLM`]: + +```py +>>> from transformers import TFAutoModelForSeq2SeqLM + +>>> model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint) +``` + +Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_billsum["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_test_set = model.prepare_tf_dataset( +... tokenized_billsum["test"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +The last two things to set up before you start training are to compute the ROUGE score from the predictions, and to provide a way to push your model to the Hub. Both are done by using [Keras callbacks](../main_classes/keras_callbacks). + +Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set) +``` + +Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="my_awesome_billsum_model", +... tokenizer=tokenizer, +... 
) +``` + +Then bundle your callbacks together: + +```py +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callbacks to finetune the model: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks) +``` + +Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! + + + + + +For a more in-depth example of how to finetune a model for summarization, take a look at the corresponding +[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb) +or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb). + + + +## Inference + +Great, now that you've finetuned a model, you can use it for inference! + +Come up with some text you'd like to summarize. For T5, you need to prefix your input depending on the task you're working on. For summarization you should prefix your input as shown below: + +```py +>>> text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes." +``` + +The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for summarization with your model, and pass your text to it: + +```py +>>> from transformers import pipeline + +>>> summarizer = pipeline("summarization", model="stevhliu/my_awesome_billsum_model") +>>> summarizer(text) +[{"summary_text": "The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country."}] +``` + +You can also manually replicate the results of the `pipeline` if you'd like: + + + + +Tokenize the text and return the `input_ids` as PyTorch tensors: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_billsum_model") +>>> inputs = tokenizer(text, return_tensors="pt").input_ids +``` + +Use the [`~transformers.generation_utils.GenerationMixin.generate`] method to create the summarization. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](../main_classes/text_generation) API. + +```py +>>> from transformers import AutoModelForSeq2SeqLM + +>>> model = AutoModelForSeq2SeqLM.from_pretrained("stevhliu/my_awesome_billsum_model") +>>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=False) +``` + +Decode the generated token ids back into text: + +```py +>>> tokenizer.decode(outputs[0], skip_special_tokens=True) +'the inflation reduction act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in american history. 
it will ask the ultra-wealthy and corporations to pay their fair share.' +``` + + +Tokenize the text and return the `input_ids` as TensorFlow tensors: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_billsum_model") +>>> inputs = tokenizer(text, return_tensors="tf").input_ids +``` + +Use the [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] method to create the summarization. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](../main_classes/text_generation) API. + +```py +>>> from transformers import TFAutoModelForSeq2SeqLM + +>>> model = TFAutoModelForSeq2SeqLM.from_pretrained("stevhliu/my_awesome_billsum_model") +>>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=False) +``` + +Decode the generated token ids back into text: + +```py +>>> tokenizer.decode(outputs[0], skip_special_tokens=True) +'the inflation reduction act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in american history. it will ask the ultra-wealthy and corporations to pay their fair share.' +``` + + diff --git a/docs/source/en/tasks/summarization.mdx b/docs/source/en/tasks/summarization.mdx deleted file mode 100644 index 1663c1f71348..000000000000 --- a/docs/source/en/tasks/summarization.mdx +++ /dev/null @@ -1,390 +0,0 @@ - - -# Summarization - - - -Summarization creates a shorter version of a document or an article that captures all the important information. Along with translation, it is another example of a task that can be formulated as a sequence-to-sequence task. Summarization can be: - -- Extractive: extract the most relevant information from a document. -- Abstractive: generate new text that captures the most relevant information. - -This guide will show you how to: - -1. Finetune [T5](https://huggingface.co/t5-small) on the California state bill subset of the [BillSum](https://huggingface.co/datasets/billsum) dataset for abstractive summarization. -2. Use your finetuned model for inference. - - - -See the summarization [task page](https://huggingface.co/tasks/summarization) for more information about its associated models, datasets, and metrics. - - - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install transformers datasets evaluate -``` - -We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load BillSum dataset - -Start by loading the smaller California state bill subset of the BillSum dataset from the 🤗 Datasets library: - -```py ->>> from datasets import load_dataset - ->>> billsum = load_dataset("billsum", split="ca_test") -``` - -Split the dataset into a train and test set with the [`~datasets.Dataset.train_test_split`] method: - -```py ->>> billsum = billsum.train_test_split(test_size=0.2) -``` - -Then take a look at an example: - -```py ->>> billsum["train"][0] -{'summary': 'Existing law authorizes state agencies to enter into contracts for the acquisition of goods or services upon approval by the Department of General Services. 
Existing law sets forth various requirements and prohibitions for those contracts, including, but not limited to, a prohibition on entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between spouses and domestic partners or same-sex and different-sex couples in the provision of benefits. Existing law provides that a contract entered into in violation of those requirements and prohibitions is void and authorizes the state or any person acting on behalf of the state to bring a civil action seeking a determination that a contract is in violation and therefore void. Under existing law, a willful violation of those requirements and prohibitions is a misdemeanor.\nThis bill would also prohibit a state agency from entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between employees on the basis of gender identity in the provision of benefits, as specified. By expanding the scope of a crime, this bill would impose a state-mandated local program.\nThe California Constitution requires the state to reimburse local agencies and school districts for certain costs mandated by the state. Statutory provisions establish procedures for making that reimbursement.\nThis bill would provide that no reimbursement is required by this act for a specified reason.', - 'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 10295.35 is added to the Public Contract Code, to read:\n10295.35.\n(a) (1) Notwithstanding any other law, a state agency shall not enter into any contract for the acquisition of goods or services in the amount of one hundred thousand dollars ($100,000) or more with a contractor that, in the provision of benefits, discriminates between employees on the basis of an employee’s or dependent’s actual or perceived gender identity, including, but not limited to, the employee’s or dependent’s identification as transgender.\n(2) For purposes of this section, “contract” includes contracts with a cumulative amount of one hundred thousand dollars ($100,000) or more per contractor in each fiscal year.\n(3) For purposes of this section, an employee health plan is discriminatory if the plan is not consistent with Section 1365.5 of the Health and Safety Code and Section 10140 of the Insurance Code.\n(4) The requirements of this section shall apply only to those portions of a contractor’s operations that occur under any of the following conditions:\n(A) Within the state.\n(B) On real property outside the state if the property is owned by the state or if the state has a right to occupy the property, and if the contractor’s presence at that location is connected to a contract with the state.\n(C) Elsewhere in the United States where work related to a state contract is being performed.\n(b) Contractors shall treat as confidential, to the maximum extent allowed by law or by the requirement of the contractor’s insurance provider, any request by an employee or applicant for employment benefits or any documentation of eligibility for benefits submitted by an employee or applicant for employment.\n(c) After taking all reasonable measures to find a contractor that complies with this section, as determined by the state agency, the requirements of this section may be waived under any of the following circumstances:\n(1) There is only one prospective contractor willing to enter into a specific contract with the state agency.\n(2) The contract is 
necessary to respond to an emergency, as determined by the state agency, that endangers the public health, welfare, or safety, or the contract is necessary for the provision of essential services, and no entity that complies with the requirements of this section capable of responding to the emergency is immediately available.\n(3) The requirements of this section violate, or are inconsistent with, the terms or conditions of a grant, subvention, or agreement, if the agency has made a good faith attempt to change the terms or conditions of any grant, subvention, or agreement to authorize application of this section.\n(4) The contractor is providing wholesale or bulk water, power, or natural gas, the conveyance or transmission of the same, or ancillary services, as required for ensuring reliable services in accordance with good utility practice, if the purchase of the same cannot practically be accomplished through the standard competitive bidding procedures and the contractor is not providing direct retail services to end users.\n(d) (1) A contractor shall not be deemed to discriminate in the provision of benefits if the contractor, in providing the benefits, pays the actual costs incurred in obtaining the benefit.\n(2) If a contractor is unable to provide a certain benefit, despite taking reasonable measures to do so, the contractor shall not be deemed to discriminate in the provision of benefits.\n(e) (1) Every contract subject to this chapter shall contain a statement by which the contractor certifies that the contractor is in compliance with this section.\n(2) The department or other contracting agency shall enforce this section pursuant to its existing enforcement powers.\n(3) (A) If a contractor falsely certifies that it is in compliance with this section, the contract with that contractor shall be subject to Article 9 (commencing with Section 10420), unless, within a time period specified by the department or other contracting agency, the contractor provides to the department or agency proof that it has complied, or is in the process of complying, with this section.\n(B) The application of the remedies or penalties contained in Article 9 (commencing with Section 10420) to a contract subject to this chapter shall not preclude the application of any existing remedies otherwise available to the department or other contracting agency under its existing enforcement powers.\n(f) Nothing in this section is intended to regulate the contracting practices of any local jurisdiction.\n(g) This section shall be construed so as not to conflict with applicable federal laws, rules, or regulations. In the event that a court or agency of competent jurisdiction holds that federal law, rule, or regulation invalidates any clause, sentence, paragraph, or section of this code or the application thereof to any person or circumstances, it is the intent of the state that the court or agency sever that clause, sentence, paragraph, or section so that the remainder of this section shall remain in effect.\nSEC. 2.\nSection 10295.35 of the Public Contract Code shall not be construed to create any new enforcement authority or responsibility in the Department of General Services or any other contracting agency.\nSEC. 
3.\nNo reimbursement is required by this act pursuant to Section 6 of Article XIII\u2009B of the California Constitution because the only costs that may be incurred by a local agency or school district will be incurred because this act creates a new crime or infraction, eliminates a crime or infraction, or changes the penalty for a crime or infraction, within the meaning of Section 17556 of the Government Code, or changes the definition of a crime within the meaning of Section 6 of Article XIII\u2009B of the California Constitution.', - 'title': 'An act to add Section 10295.35 to the Public Contract Code, relating to public contracts.'} -``` - -There are two fields that you'll want to use: - -- `text`: the text of the bill which'll be the input to the model. -- `summary`: a condensed version of `text` which'll be the model target. - -## Preprocess - -The next step is to load a T5 tokenizer to process `text` and `summary`: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("t5-small") -``` - -The preprocessing function you want to create needs to: - -1. Prefix the input with a prompt so T5 knows this is a summarization task. Some models capable of multiple NLP tasks require prompting for specific tasks. -2. Use the keyword `text_target` argument when tokenizing labels. -3. Truncate sequences to be no longer than the maximum length set by the `max_length` parameter. - -```py ->>> prefix = "summarize: " - - ->>> def preprocess_function(examples): -... inputs = [prefix + doc for doc in examples["text"]] -... model_inputs = tokenizer(inputs, max_length=1024, truncation=True) - -... labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True) - -... model_inputs["labels"] = labels["input_ids"] -... return model_inputs -``` - -To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once: - -```py ->>> tokenized_billsum = billsum.map(preprocess_function, batched=True) -``` - -Now create a batch of examples using [`DataCollatorForSeq2Seq`]. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximium length. - - - -```py ->>> from transformers import DataCollatorForSeq2Seq - ->>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model) -``` - - -```py ->>> from transformers import DataCollatorForSeq2Seq - ->>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf") -``` - - - -## Evaluate - -Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [ROUGE](https://huggingface.co/spaces/evaluate-metric/rouge) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): - -```py ->>> import evaluate - ->>> rouge = evaluate.load("rouge") -``` - -Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the ROUGE metric: - -```py ->>> import numpy as np - - ->>> def compute_metrics(eval_pred): -... predictions, labels = eval_pred -... 
decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True) -... labels = np.where(labels != -100, labels, tokenizer.pad_token_id) -... decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) - -... result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True) - -... prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions] -... result["gen_len"] = np.mean(prediction_lens) - -... return {k: round(v, 4) for k, v in result.items()} -``` - -Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. - -## Train - - - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! - - -You're ready to start training your model now! Load T5 with [`AutoModelForSeq2SeqLM`]: - -```py ->>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer - ->>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-small") -``` - -At this point, only three steps remain: - -1. Define your training hyperparameters in [`Seq2SeqTrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the ROUGE metric and save the training checkpoint. -2. Pass the training arguments to [`Seq2SeqTrainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. -3. Call [`~Trainer.train`] to finetune your model. - -```py ->>> training_args = Seq2SeqTrainingArguments( -... output_dir="my_awesome_billsum_model", -... evaluation_strategy="epoch", -... learning_rate=2e-5, -... per_device_train_batch_size=16, -... per_device_eval_batch_size=16, -... weight_decay=0.01, -... save_total_limit=3, -... num_train_epochs=4, -... predict_with_generate=True, -... fp16=True, -... push_to_hub=True, -... ) - ->>> trainer = Seq2SeqTrainer( -... model=model, -... args=training_args, -... train_dataset=tokenized_billsum["train"], -... eval_dataset=tokenized_billsum["test"], -... tokenizer=tokenizer, -... data_collator=data_collator, -... compute_metrics=compute_metrics, -... ) - ->>> trainer.train() -``` - -Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - - - - -If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! - - -To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: - -```py ->>> from transformers import create_optimizer, AdamWeightDecay - ->>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) -``` - -Then you can load T5 with [`TFAutoModelForSeq2SeqLM`]: - -```py ->>> from transformers import TFAutoModelForSeq2SeqLM - ->>> model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small") -``` - -Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: - -```py ->>> tf_train_set = model.prepare_tf_dataset( -... tokenized_billsum["train"], -... shuffle=True, -... batch_size=16, -... collate_fn=data_collator, -... 
) - ->>> tf_test_set = model.prepare_tf_dataset( -... tokenized_billsum["test"], -... shuffle=False, -... batch_size=16, -... collate_fn=data_collator, -... ) -``` - -Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> import tensorflow as tf - ->>> model.compile(optimizer=optimizer) -``` - -The last two things to setup before you start training is to compute the ROUGE score from the predictions, and provide a way to push your model to the Hub. Both are done by using [Keras callbacks](./main_classes/keras_callbacks). - -Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: - -```py ->>> from transformers.keras_callbacks import KerasMetricCallback - ->>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) -``` - -Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: - -```py ->>> from transformers.keras_callbacks import PushToHubCallback - ->>> push_to_hub_callback = PushToHubCallback( -... output_dir="my_awesome_billsum_model", -... tokenizer=tokenizer, -... ) -``` - -Then bundle your callbacks together: - -```py ->>> callbacks = [metric_callback, push_to_hub_callback] -``` - -Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callbacks to finetune the model: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks) -``` - -Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! - - - - - -For a more in-depth example of how to finetune a model for summarization, take a look at the corresponding -[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb) -or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb). - - - -## Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Come up with some text you'd like to summarize. For T5, you need to prefix your input depending on the task you're working on. For summarization you should prefix your input as shown below: - -```py ->>> text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes." -``` - -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for summarization with your model, and pass your text to it: - -```py ->>> from transformers import pipeline - ->>> summarizer = pipeline("summarization", model="stevhliu/my_awesome_billsum_model") ->>> summarizer(text) -[{"summary_text": "The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. 
It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country."}] -``` - -You can also manually replicate the results of the `pipeline` if you'd like: - - - - -Tokenize the text and return the `input_ids` as PyTorch tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_billsum_model") ->>> inputs = tokenizer(text, return_tensors="pt").input_ids -``` - -Use the [`~transformers.generation_utils.GenerationMixin.generate`] method to create the summarization. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](./main_classes/text_generation) API. - -```py ->>> from transformers import AutoModelForSeq2SeqLM - ->>> model = AutoModelForSeq2SeqLM.from_pretrained("stevhliu/my_awesome_billsum_model") ->>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=False) -``` - -Decode the generated token ids back into text: - -```py ->>> tokenizer.decode(outputs[0], skip_special_tokens=True) -'the inflation reduction act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in american history. it will ask the ultra-wealthy and corporations to pay their fair share.' -``` - - -Tokenize the text and return the `input_ids` as TensorFlow tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_billsum_model") ->>> inputs = tokenizer(text, return_tensors="tf").input_ids -``` - -Use the [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] method to create the summarization. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](./main_classes/text_generation) API. - -```py ->>> from transformers import TFAutoModelForSeq2SeqLM - ->>> model = TFAutoModelForSeq2SeqLM.from_pretrained("stevhliu/my_awesome_billsum_model") ->>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=False) -``` - -Decode the generated token ids back into text: - -```py ->>> tokenizer.decode(outputs[0], skip_special_tokens=True) -'the inflation reduction act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in american history. it will ask the ultra-wealthy and corporations to pay their fair share.' -``` - - \ No newline at end of file diff --git a/docs/source/en/tasks/text-to-speech.md b/docs/source/en/tasks/text-to-speech.md new file mode 100644 index 000000000000..86a0d49fd04d --- /dev/null +++ b/docs/source/en/tasks/text-to-speech.md @@ -0,0 +1,633 @@ + + +# Text to speech + +[[open-in-colab]] + +Text-to-speech (TTS) is the task of creating natural-sounding speech from text, where the speech can be generated in multiple +languages and for multiple speakers. Several text-to-speech models are currently available in 🤗 Transformers, such as +[Bark](../model_doc/bark), [MMS](../model_doc/mms), [VITS](../model_doc/vits) and [SpeechT5](../model_doc/speecht5). + +You can easily generate audio using the `"text-to-audio"` pipeline (or its alias - `"text-to-speech"`). Some models, like Bark, +can also be conditioned to generate non-verbal communications such as laughing, sighing and crying, or even add music. 
+Here's an example of how you would use the `"text-to-speech"` pipeline with Bark: + +```py +>>> from transformers import pipeline + +>>> pipe = pipeline("text-to-speech", model="suno/bark-small") +>>> text = "[clears throat] This is a test ... and I just took a long pause." +>>> output = pipe(text) +``` + +Here's a code snippet you can use to listen to the resulting audio in a notebook: + +```python +>>> from IPython.display import Audio +>>> Audio(output["audio"], rate=output["sampling_rate"]) +``` + +For more examples on what Bark and other pretrained TTS models can do, refer to our +[Audio course](https://huggingface.co/learn/audio-course/chapter6/pre-trained_models). + +If you are looking to fine-tune a TTS model, you can currently fine-tune SpeechT5 only. SpeechT5 is pre-trained on a combination of +speech-to-text and text-to-speech data, allowing it to learn a unified space of hidden representations shared by both text +and speech. This means that the same pre-trained model can be fine-tuned for different tasks. Furthermore, SpeechT5 +supports multiple speakers through x-vector speaker embeddings. + +The remainder of this guide illustrates how to: + +1. Fine-tune [SpeechT5](../model_doc/speecht5) that was originally trained on English speech on the Dutch (`nl`) language subset of the [VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) dataset. +2. Use your refined model for inference in one of two ways: using a pipeline or directly. + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install datasets soundfile speechbrain accelerate +``` + +Install 🤗Transformers from source as not all the SpeechT5 features have been merged into an official release yet: + +```bash +pip install git+https://github.com/huggingface/transformers.git +``` + + + +To follow this guide you will need a GPU. If you're working in a notebook, run the following line to check if a GPU is available: + +```bash +!nvidia-smi +``` + + + +We encourage you to log in to your Hugging Face account to upload and share your model with the community. When prompted, enter your token to log in: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load the dataset + +[VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) is a large-scale multilingual speech corpus consisting of +data sourced from 2009-2020 European Parliament event recordings. It contains labelled audio-transcription data for 15 +European languages. In this guide, we are using the Dutch language subset, feel free to pick another subset. + +Note that VoxPopuli or any other automated speech recognition (ASR) dataset may not be the most suitable +option for training TTS models. The features that make it beneficial for ASR, such as excessive background noise, are +typically undesirable in TTS. However, finding top-quality, multilingual, and multi-speaker TTS datasets can be quite +challenging. + +Let's load the data: + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("facebook/voxpopuli", "nl", split="train") +>>> len(dataset) +20968 +``` + +20968 examples should be sufficient for fine-tuning. 
SpeechT5 expects audio data to have a sampling rate of 16 kHz, so +make sure the examples in the dataset meet this requirement: + +```py +dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) +``` + +## Preprocess the data + +Let's begin by defining the model checkpoint to use and loading the appropriate processor: + +```py +>>> from transformers import SpeechT5Processor + +>>> checkpoint = "microsoft/speecht5_tts" +>>> processor = SpeechT5Processor.from_pretrained(checkpoint) +``` + +### Text cleanup for SpeechT5 tokenization + +Start by cleaning up the text data. You'll need the tokenizer part of the processor to process the text: + +```py +>>> tokenizer = processor.tokenizer +``` + +The dataset examples contain `raw_text` and `normalized_text` features. When deciding which feature to use as the text input, +consider that the SpeechT5 tokenizer doesn't have any tokens for numbers. In `normalized_text` the numbers are written +out as text. Thus, it is a better fit, and we recommend using `normalized_text` as input text. + +Because SpeechT5 was trained on the English language, it may not recognize certain characters in the Dutch dataset. If +left as is, these characters will be converted to `<unk>` tokens. However, in Dutch, certain characters like `à` are +used to stress syllables. In order to preserve the meaning of the text, we can replace this character with a regular `a`. + +To identify unsupported tokens, extract all unique characters in the dataset using the `SpeechT5Tokenizer` which +works with characters as tokens. To do this, write the `extract_all_chars` mapping function that concatenates +the transcriptions from all examples into one string and converts it to a set of characters. +Make sure to set `batched=True` and `batch_size=-1` in `dataset.map()` so that all transcriptions are available at once for +the mapping function. + +```py +>>> def extract_all_chars(batch): +... all_text = " ".join(batch["normalized_text"]) +... vocab = list(set(all_text)) +... return {"vocab": [vocab], "all_text": [all_text]} + + +>>> vocabs = dataset.map( +... extract_all_chars, +... batched=True, +... batch_size=-1, +... keep_in_memory=True, +... remove_columns=dataset.column_names, +... ) + +>>> dataset_vocab = set(vocabs["vocab"][0]) +>>> tokenizer_vocab = {k for k, _ in tokenizer.get_vocab().items()} +``` + +Now you have two sets of characters: one with the vocabulary from the dataset and one with the vocabulary from the tokenizer. +To identify any unsupported characters in the dataset, you can take the difference between these two sets. The resulting +set will contain the characters that are in the dataset but not in the tokenizer. + +```py +>>> dataset_vocab - tokenizer_vocab +{' ', 'à', 'ç', 'è', 'ë', 'í', 'ï', 'ö', 'ü'} +``` + +To handle the unsupported characters identified in the previous step, define a function that maps these characters to +valid tokens. Note that spaces are already replaced by `▁` in the tokenizer and don't need to be handled separately. + +```py +>>> replacements = [ +... ("à", "a"), +... ("ç", "c"), +... ("è", "e"), +... ("ë", "e"), +... ("í", "i"), +... ("ï", "i"), +... ("ö", "o"), +... ("ü", "u"), +... ] + + +>>> def cleanup_text(inputs): +... for src, dst in replacements: +... inputs["normalized_text"] = inputs["normalized_text"].replace(src, dst) +... return inputs + + +>>> dataset = dataset.map(cleanup_text) +``` + +Now that you have dealt with special characters in the text, it's time to shift focus to the audio data. 
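+Before doing so, you can optionally double-check that the replacements worked. This is a minimal sketch (the `vocabs_clean` name is just for illustration) that re-runs the `extract_all_chars` mapping from above and assumes `tokenizer_vocab` is still in scope; after cleanup, only the space character should remain, and the tokenizer already represents spaces as `▁`:
+
+```py
+>>> # re-extract the unique characters from the cleaned-up transcriptions
+>>> vocabs_clean = dataset.map(
+... extract_all_chars,
+... batched=True,
+... batch_size=-1,
+... keep_in_memory=True,
+... remove_columns=dataset.column_names,
+... )
+
+>>> set(vocabs_clean["vocab"][0]) - tokenizer_vocab
+{' '}
+```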
+ +### Speakers + +The VoxPopuli dataset includes speech from multiple speakers, but how many speakers are represented in the dataset? To +determine this, we can count the number of unique speakers and the number of examples each speaker contributes to the dataset. +With a total of 20,968 examples in the dataset, this information will give us a better understanding of the distribution of +speakers and examples in the data. + +```py +>>> from collections import defaultdict + +>>> speaker_counts = defaultdict(int) + +>>> for speaker_id in dataset["speaker_id"]: +... speaker_counts[speaker_id] += 1 +``` + +By plotting a histogram you can get a sense of how much data there is for each speaker. + +```py +>>> import matplotlib.pyplot as plt + +>>> plt.figure() +>>> plt.hist(speaker_counts.values(), bins=20) +>>> plt.ylabel("Speakers") +>>> plt.xlabel("Examples") +>>> plt.show() +``` + +
+ Speakers histogram +
+ +The histogram reveals that approximately one-third of the speakers in the dataset have fewer than 100 examples, while +around ten speakers have more than 500 examples. To improve training efficiency and balance the dataset, we can limit +the data to speakers with between 100 and 400 examples. + +```py +>>> def select_speaker(speaker_id): +... return 100 <= speaker_counts[speaker_id] <= 400 + + +>>> dataset = dataset.filter(select_speaker, input_columns=["speaker_id"]) +``` + +Let's check how many speakers remain: + +```py +>>> len(set(dataset["speaker_id"])) +42 +``` + +Let's see how many examples are left: + +```py +>>> len(dataset) +9973 +``` + +You are left with just under 10,000 examples from approximately 40 unique speakers, which should be sufficient. + +Note that some speakers with few examples may actually have more audio available if the examples are long. However, +determining the total amount of audio for each speaker requires scanning through the entire dataset, which is a +time-consuming process that involves loading and decoding each audio file. As such, we have chosen to skip this step here. + +### Speaker embeddings + +To enable the TTS model to differentiate between multiple speakers, you'll need to create a speaker embedding for each example. +The speaker embedding is an additional input into the model that captures a particular speaker's voice characteristics. +To generate these speaker embeddings, use the pre-trained [spkrec-xvect-voxceleb](https://huggingface.co/speechbrain/spkrec-xvect-voxceleb) +model from SpeechBrain. + +Create a function `create_speaker_embedding()` that takes an input audio waveform and outputs a 512-element vector +containing the corresponding speaker embedding. + +```py +>>> import os +>>> import torch +>>> from speechbrain.pretrained import EncoderClassifier + +>>> spk_model_name = "speechbrain/spkrec-xvect-voxceleb" + +>>> device = "cuda" if torch.cuda.is_available() else "cpu" +>>> speaker_model = EncoderClassifier.from_hparams( +... source=spk_model_name, +... run_opts={"device": device}, +... savedir=os.path.join("/tmp", spk_model_name), +... ) + + +>>> def create_speaker_embedding(waveform): +... with torch.no_grad(): +... speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform)) +... speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2) +... speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy() +... return speaker_embeddings +``` + +It's important to note that the `speechbrain/spkrec-xvect-voxceleb` model was trained on English speech from the VoxCeleb +dataset, whereas the training examples in this guide are in Dutch. While we believe that this model will still generate +reasonable speaker embeddings for our Dutch dataset, this assumption may not hold true in all cases. + +For optimal results, we recommend training an X-vector model on the target speech first. This will ensure that the model +is better able to capture the unique voice characteristics present in the Dutch language. + +### Processing the dataset + +Finally, let's process the data into the format the model expects. Create a `prepare_dataset` function that takes in a +single example and uses the `SpeechT5Processor` object to tokenize the input text and load the target audio into a log-mel spectrogram. +It should also add the speaker embeddings as an additional input. + +```py +>>> def prepare_dataset(example): +... audio = example["audio"] + +... example = processor( +... text=example["normalized_text"], +... 
audio_target=audio["array"], +... sampling_rate=audio["sampling_rate"], +... return_attention_mask=False, +... ) + +... # strip off the batch dimension +... example["labels"] = example["labels"][0] + +... # use SpeechBrain to obtain x-vector +... example["speaker_embeddings"] = create_speaker_embedding(audio["array"]) + +... return example +``` + +Verify the processing is correct by looking at a single example: + +```py +>>> processed_example = prepare_dataset(dataset[0]) +>>> list(processed_example.keys()) +['input_ids', 'labels', 'stop_labels', 'speaker_embeddings'] +``` + +Speaker embeddings should be a 512-element vector: + +```py +>>> processed_example["speaker_embeddings"].shape +(512,) +``` + +The labels should be a log-mel spectrogram with 80 mel bins. + +```py +>>> import matplotlib.pyplot as plt + +>>> plt.figure() +>>> plt.imshow(processed_example["labels"].T) +>>> plt.show() +``` + +
+ Log-mel spectrogram with 80 mel bins +
+ +Side note: If you find this spectrogram confusing, it may be due to your familiarity with the convention of placing low frequencies +at the bottom and high frequencies at the top of a plot. However, when plotting spectrograms as an image using the matplotlib library, +the y-axis is flipped and the spectrograms appear upside down. + +Now apply the processing function to the entire dataset. This will take between 5 and 10 minutes. + +```py +>>> dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names) +``` + +You'll see a warning saying that some examples in the dataset are longer than the maximum input length the model can handle (600 tokens). +Remove those examples from the dataset. Here we go even further: to allow for larger batch sizes, we remove anything over 200 tokens. + +```py +>>> def is_not_too_long(input_ids): +...     input_length = len(input_ids) +...     return input_length < 200 + + +>>> dataset = dataset.filter(is_not_too_long, input_columns=["input_ids"]) +>>> len(dataset) +8259 +``` + +Next, create a basic train/test split: + +```py +>>> dataset = dataset.train_test_split(test_size=0.1) +``` + +### Data collator + +To combine multiple examples into a batch, you need to define a custom data collator. This collator will pad shorter sequences with padding +tokens, ensuring that all examples have the same length. For the spectrogram labels, the padded portions are replaced with the special value `-100`. This special value +instructs the model to ignore that part of the spectrogram when calculating the spectrogram loss. + +```py +>>> from dataclasses import dataclass +>>> from typing import Any, Dict, List, Union + + +>>> @dataclass +... class TTSDataCollatorWithPadding: +...     processor: Any + +...     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: +...         input_ids = [{"input_ids": feature["input_ids"]} for feature in features] +...         label_features = [{"input_values": feature["labels"]} for feature in features] +...         speaker_features = [feature["speaker_embeddings"] for feature in features] + +...         # collate the inputs and targets into a batch +...         batch = self.processor.pad(input_ids=input_ids, labels=label_features, return_tensors="pt") + +...         # replace padding with -100 to ignore loss correctly +...         batch["labels"] = batch["labels"].masked_fill(batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100) + +...         # not used during fine-tuning +...         del batch["decoder_attention_mask"] + +...         # round down target lengths to multiple of reduction factor +...         if model.config.reduction_factor > 1: +...             target_lengths = torch.tensor([len(feature["input_values"]) for feature in label_features]) +...             target_lengths = target_lengths.new( +...                 [length - length % model.config.reduction_factor for length in target_lengths] +...             ) +...             max_length = max(target_lengths) +...             batch["labels"] = batch["labels"][:, :max_length] + +...         # also add in the speaker embeddings +...         batch["speaker_embeddings"] = torch.tensor(speaker_features) + +...         return batch +``` + +In SpeechT5, the input to the decoder part of the model is reduced by a factor of 2. In other words, it throws away every +other timestep from the target sequence. The decoder then predicts a sequence that is twice as long. Since the original +target sequence length may be odd, the data collator makes sure to round the maximum length of the batch down to be a +multiple of 2. 
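+ +To make this rounding step concrete, here is a small standalone sketch (not part of the original guide) that uses a few +made-up spectrogram lengths and an assumed reduction factor of 2. The collator above applies the same arithmetic with +`model.config.reduction_factor` before truncating `batch["labels"]` to the longest rounded length: + +```py +>>> import torch + +>>> reduction_factor = 2  # assumed value for illustration only +>>> target_lengths = torch.tensor([219, 184, 201])  # hypothetical spectrogram lengths in frames + +>>> # drop the remainder so every length becomes a multiple of the reduction factor +>>> target_lengths - target_lengths % reduction_factor +tensor([218, 184, 200]) +```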
+ +```py +>>> data_collator = TTSDataCollatorWithPadding(processor=processor) +``` + +## Train the model + +Load the pre-trained model from the same checkpoint as you used for loading the processor: + +```py +>>> from transformers import SpeechT5ForTextToSpeech + +>>> model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint) +``` + +The `use_cache=True` option is incompatible with gradient checkpointing. Disable it for training. + +```py +>>> model.config.use_cache = False +``` + +Define the training arguments. Here we are not computing any evaluation metrics during the training process. Instead, we'll +only look at the loss: + +```python +>>> from transformers import Seq2SeqTrainingArguments + +>>> training_args = Seq2SeqTrainingArguments( +... output_dir="speecht5_finetuned_voxpopuli_nl", # change to a repo name of your choice +... per_device_train_batch_size=4, +... gradient_accumulation_steps=8, +... learning_rate=1e-5, +... warmup_steps=500, +... max_steps=4000, +... gradient_checkpointing=True, +... fp16=True, +... evaluation_strategy="steps", +... per_device_eval_batch_size=2, +... save_steps=1000, +... eval_steps=1000, +... logging_steps=25, +... report_to=["tensorboard"], +... load_best_model_at_end=True, +... greater_is_better=False, +... label_names=["labels"], +... push_to_hub=True, +... ) +``` + +Instantiate the `Trainer` object and pass the model, dataset, and data collator to it. + +```py +>>> from transformers import Seq2SeqTrainer + +>>> trainer = Seq2SeqTrainer( +... args=training_args, +... model=model, +... train_dataset=dataset["train"], +... eval_dataset=dataset["test"], +... data_collator=data_collator, +... tokenizer=processor, +... ) +``` + +And with that, you're ready to start training! Training will take several hours. Depending on your GPU, +it is possible that you will encounter a CUDA "out-of-memory" error when you start training. In this case, you can reduce +the `per_device_train_batch_size` incrementally by factors of 2 and increase `gradient_accumulation_steps` by 2x to compensate. + +```py +>>> trainer.train() +``` + +To be able to use your checkpoint with a pipeline, make sure to save the processor with the checkpoint: + +```py +>>> processor.save_pretrained("YOUR_ACCOUNT_NAME/speecht5_finetuned_voxpopuli_nl") +``` + +Push the final model to the 🤗 Hub: + +```py +>>> trainer.push_to_hub() +``` + +## Inference + +### Inference with a pipeline + +Great, now that you've fine-tuned a model, you can use it for inference! +First, let's see how you can use it with a corresponding pipeline. Let's create a `"text-to-speech"` pipeline with your +checkpoint: + +```py +>>> from transformers import pipeline + +>>> pipe = pipeline("text-to-speech", model="YOUR_ACCOUNT_NAME/speecht5_finetuned_voxpopuli_nl") +``` + +Pick a piece of text in Dutch you'd like narrated, e.g.: + +```py +>>> text = "hallo allemaal, ik praat nederlands. groetjes aan iedereen!" +``` + +To use SpeechT5 with the pipeline, you'll need a speaker embedding. 
Let's get it from an example in the test dataset: + +```py +>>> example = dataset["test"][304] +>>> speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0) +``` + +Now you can pass the text and speaker embeddings to the pipeline, and it will take care of the rest: + +```py +>>> forward_params = {"speaker_embeddings": speaker_embeddings} +>>> output = pipe(text, forward_params=forward_params) +>>> output +{'audio': array([-6.82714235e-05, -4.26525949e-04, 1.06134125e-04, ..., +        -1.22392643e-03, -7.76011671e-04, 3.29112721e-04], dtype=float32), + 'sampling_rate': 16000} +``` + +You can then listen to the result: + +```py +>>> from IPython.display import Audio +>>> Audio(output['audio'], rate=output['sampling_rate']) +``` + +### Run inference manually + +You can achieve the same inference results without using the pipeline; however, more steps will be required. + +Load the model from the 🤗 Hub: + +```py +>>> model = SpeechT5ForTextToSpeech.from_pretrained("YOUR_ACCOUNT_NAME/speecht5_finetuned_voxpopuli_nl") +``` + +Pick an example from the test dataset to obtain a speaker embedding. + +```py +>>> example = dataset["test"][304] +>>> speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0) +``` + +Define the input text and tokenize it. + +```py +>>> text = "hallo allemaal, ik praat nederlands. groetjes aan iedereen!" +>>> inputs = processor(text=text, return_tensors="pt") +``` + +Create a spectrogram with your model: + +```py +>>> spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings) +``` + +Visualize the spectrogram, if you'd like to: + +```py +>>> plt.figure() +>>> plt.imshow(spectrogram.T) +>>> plt.show() +``` + +
+ Generated log-mel spectrogram +
+ +Finally, use the vocoder to turn the spectrogram into sound. + +```py +>>> with torch.no_grad(): +...     speech = vocoder(spectrogram) + +>>> from IPython.display import Audio + +>>> Audio(speech.numpy(), rate=16000) +``` + +In our experience, obtaining satisfactory results from this model can be challenging. The quality of the speaker +embeddings appears to be a significant factor. Since SpeechT5 was pre-trained with English x-vectors, it performs best +when using English speaker embeddings. If the synthesized speech sounds poor, try using a different speaker embedding. + +Increasing the training duration is also likely to enhance the quality of the results. Even so, the speech is clearly Dutch rather than English, and it does +capture the voice characteristics of the speaker (compare to the original audio in the example). +Another thing to experiment with is the model's configuration. For example, try using `config.reduction_factor = 1` to +see if this improves the results. + +Finally, it is essential to consider the ethical implications. Although TTS technology has numerous useful applications, it +may also be used for malicious purposes, such as impersonating someone's voice without their knowledge or consent. Please +use TTS judiciously and responsibly. \ No newline at end of file diff --git a/docs/source/en/tasks/token_classification.md b/docs/source/en/tasks/token_classification.md new file mode 100644 index 000000000000..289f2b05896a --- /dev/null +++ b/docs/source/en/tasks/token_classification.md @@ -0,0 +1,562 @@ + + +# Token classification + +[[open-in-colab]] + + + +Token classification assigns a label to individual tokens in a sentence. One of the most common token classification tasks is Named Entity Recognition (NER). NER attempts to find a label for each entity in a sentence, such as a person, location, or organization. + +This guide will show you how to: + +1. Finetune [DistilBERT](https://huggingface.co/distilbert-base-uncased) on the [WNUT 17](https://huggingface.co/datasets/wnut_17) dataset to detect new entities. +2. Use your finetuned model for inference. 
+ + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[ALBERT](../model_doc/albert), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BioGpt](../model_doc/biogpt), [BLOOM](../model_doc/bloom), [BROS](../model_doc/bros), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [ESM](../model_doc/esm), [Falcon](../model_doc/falcon), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MarkupLM](../model_doc/markuplm), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MPT](../model_doc/mpt), [MRA](../model_doc/mra), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [QDQBert](../model_doc/qdqbert), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso) + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate seqeval +``` + +We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load WNUT 17 dataset + +Start by loading the WNUT 17 dataset from the 🤗 Datasets library: + +```py +>>> from datasets import load_dataset + +>>> wnut = load_dataset("wnut_17") +``` + +Then take a look at an example: + +```py +>>> wnut["train"][0] +{'id': '0', + 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0], + 'tokens': ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.'] +} +``` + +Each number in `ner_tags` represents an entity. Convert the numbers to their label names to find out what the entities are: + +```py +>>> label_list = wnut["train"].features[f"ner_tags"].feature.names +>>> label_list +[ + "O", + "B-corporation", + "I-corporation", + "B-creative-work", + "I-creative-work", + "B-group", + "I-group", + "B-location", + "I-location", + "B-person", + "I-person", + "B-product", + "I-product", +] +``` + +The letter that prefixes each `ner_tag` indicates the token position of the entity: + +- `B-` indicates the beginning of an entity. 
+- `I-` indicates a token is contained inside the same entity (for example, the `State` token is a part of an entity like + `Empire State Building`). +- `0` indicates the token doesn't correspond to any entity. + +## Preprocess + + + +The next step is to load a DistilBERT tokenizer to preprocess the `tokens` field: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") +``` + +As you saw in the example `tokens` field above, it looks like the input has already been tokenized. But the input actually hasn't been tokenized yet and you'll need to set `is_split_into_words=True` to tokenize the words into subwords. For example: + +```py +>>> example = wnut["train"][0] +>>> tokenized_input = tokenizer(example["tokens"], is_split_into_words=True) +>>> tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"]) +>>> tokens +['[CLS]', '@', 'paul', '##walk', 'it', "'", 's', 'the', 'view', 'from', 'where', 'i', "'", 'm', 'living', 'for', 'two', 'weeks', '.', 'empire', 'state', 'building', '=', 'es', '##b', '.', 'pretty', 'bad', 'storm', 'here', 'last', 'evening', '.', '[SEP]'] +``` + +However, this adds some special tokens `[CLS]` and `[SEP]` and the subword tokenization creates a mismatch between the input and labels. A single word corresponding to a single label may now be split into two subwords. You'll need to realign the tokens and labels by: + +1. Mapping all tokens to their corresponding word with the [`word_ids`](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.BatchEncoding.word_ids) method. +2. Assigning the label `-100` to the special tokens `[CLS]` and `[SEP]` so they're ignored by the PyTorch loss function (see [CrossEntropyLoss](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html)). +3. Only labeling the first token of a given word. Assign `-100` to other subtokens from the same word. + +Here is how you can create a function to realign the tokens and labels, and truncate sequences to be no longer than DistilBERT's maximum input length: + +```py +>>> def tokenize_and_align_labels(examples): +... tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True) + +... labels = [] +... for i, label in enumerate(examples[f"ner_tags"]): +... word_ids = tokenized_inputs.word_ids(batch_index=i) # Map tokens to their respective word. +... previous_word_idx = None +... label_ids = [] +... for word_idx in word_ids: # Set the special tokens to -100. +... if word_idx is None: +... label_ids.append(-100) +... elif word_idx != previous_word_idx: # Only label the first token of a given word. +... label_ids.append(label[word_idx]) +... else: +... label_ids.append(-100) +... previous_word_idx = word_idx +... labels.append(label_ids) + +... tokenized_inputs["labels"] = labels +... return tokenized_inputs +``` + +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once: + +```py +>>> tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True) +``` + +Now create a batch of examples using [`DataCollatorWithPadding`]. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. 
+ + + +```py +>>> from transformers import DataCollatorForTokenClassification + +>>> data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) +``` + + +```py +>>> from transformers import DataCollatorForTokenClassification + +>>> data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf") +``` + + + +## Evaluate + +Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [seqeval](https://huggingface.co/spaces/evaluate-metric/seqeval) framework (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric). Seqeval actually produces several scores: precision, recall, F1, and accuracy. + +```py +>>> import evaluate + +>>> seqeval = evaluate.load("seqeval") +``` + +Get the NER labels first, and then create a function that passes your true predictions and true labels to [`~evaluate.EvaluationModule.compute`] to calculate the scores: + +```py +>>> import numpy as np + +>>> labels = [label_list[i] for i in example[f"ner_tags"]] + + +>>> def compute_metrics(p): +... predictions, labels = p +... predictions = np.argmax(predictions, axis=2) + +... true_predictions = [ +... [label_list[p] for (p, l) in zip(prediction, label) if l != -100] +... for prediction, label in zip(predictions, labels) +... ] +... true_labels = [ +... [label_list[l] for (p, l) in zip(prediction, label) if l != -100] +... for prediction, label in zip(predictions, labels) +... ] + +... results = seqeval.compute(predictions=true_predictions, references=true_labels) +... return { +... "precision": results["overall_precision"], +... "recall": results["overall_recall"], +... "f1": results["overall_f1"], +... "accuracy": results["overall_accuracy"], +... } +``` + +Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. + +## Train + +Before you start training your model, create a map of the expected ids to their labels with `id2label` and `label2id`: + +```py +>>> id2label = { +... 0: "O", +... 1: "B-corporation", +... 2: "I-corporation", +... 3: "B-creative-work", +... 4: "I-creative-work", +... 5: "B-group", +... 6: "I-group", +... 7: "B-location", +... 8: "I-location", +... 9: "B-person", +... 10: "I-person", +... 11: "B-product", +... 12: "I-product", +... } +>>> label2id = { +... "O": 0, +... "B-corporation": 1, +... "I-corporation": 2, +... "B-creative-work": 3, +... "I-creative-work": 4, +... "B-group": 5, +... "I-group": 6, +... "B-location": 7, +... "I-location": 8, +... "B-person": 9, +... "I-person": 10, +... "B-product": 11, +... "I-product": 12, +... } +``` + + + + + +If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! + + + +You're ready to start training your model now! Load DistilBERT with [`AutoModelForTokenClassification`] along with the number of expected labels, and the label mappings: + +```py +>>> from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer + +>>> model = AutoModelForTokenClassification.from_pretrained( +... "distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id +... ) +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`TrainingArguments`]. 
The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the seqeval scores and save the training checkpoint. +2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_wnut_model", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... num_train_epochs=2, +... weight_decay=0.01, +... evaluation_strategy="epoch", +... save_strategy="epoch", +... load_best_model_at_end=True, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_wnut["train"], +... eval_dataset=tokenized_wnut["test"], +... tokenizer=tokenizer, +... data_collator=data_collator, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + +If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! + + +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +```py +>>> from transformers import create_optimizer + +>>> batch_size = 16 +>>> num_train_epochs = 3 +>>> num_train_steps = (len(tokenized_wnut["train"]) // batch_size) * num_train_epochs +>>> optimizer, lr_schedule = create_optimizer( +... init_lr=2e-5, +... num_train_steps=num_train_steps, +... weight_decay_rate=0.01, +... num_warmup_steps=0, +... ) +``` + +Then you can load DistilBERT with [`TFAutoModelForTokenClassification`] along with the number of expected labels, and the label mappings: + +```py +>>> from transformers import TFAutoModelForTokenClassification + +>>> model = TFAutoModelForTokenClassification.from_pretrained( +... "distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id +... ) +``` + +Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_wnut["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_validation_set = model.prepare_tf_dataset( +... tokenized_wnut["validation"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +The last two things to setup before you start training is to compute the seqeval scores from the predictions, and provide a way to push your model to the Hub. Both are done by using [Keras callbacks](../main_classes/keras_callbacks). 
+ +Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) +``` + +Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="my_awesome_wnut_model", +... tokenizer=tokenizer, +... ) +``` + +Then bundle your callbacks together: + +```py +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callbacks to finetune the model: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks) +``` + +Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! + + + + + +For a more in-depth example of how to finetune a model for token classification, take a look at the corresponding +[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb) +or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb). + + + +## Inference + +Great, now that you've finetuned a model, you can use it for inference! + +Grab some text you'd like to run inference on: + +```py +>>> text = "The Golden State Warriors are an American professional basketball team based in San Francisco." +``` + +The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for NER with your model, and pass your text to it: + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline("ner", model="stevhliu/my_awesome_wnut_model") +>>> classifier(text) +[{'entity': 'B-location', + 'score': 0.42658573, + 'index': 2, + 'word': 'golden', + 'start': 4, + 'end': 10}, + {'entity': 'I-location', + 'score': 0.35856336, + 'index': 3, + 'word': 'state', + 'start': 11, + 'end': 16}, + {'entity': 'B-group', + 'score': 0.3064001, + 'index': 4, + 'word': 'warriors', + 'start': 17, + 'end': 25}, + {'entity': 'B-location', + 'score': 0.65523505, + 'index': 13, + 'word': 'san', + 'start': 80, + 'end': 83}, + {'entity': 'B-location', + 'score': 0.4668663, + 'index': 14, + 'word': 'francisco', + 'start': 84, + 'end': 93}] +``` + +You can also manually replicate the results of the `pipeline` if you'd like: + + + +Tokenize the text and return PyTorch tensors: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_wnut_model") +>>> inputs = tokenizer(text, return_tensors="pt") +``` + +Pass your inputs to the model and return the `logits`: + +```py +>>> from transformers import AutoModelForTokenClassification + +>>> model = AutoModelForTokenClassification.from_pretrained("stevhliu/my_awesome_wnut_model") +>>> with torch.no_grad(): +... 
logits = model(**inputs).logits +``` + +Get the class with the highest probability, and use the model's `id2label` mapping to convert it to a text label: + +```py +>>> predictions = torch.argmax(logits, dim=2) +>>> predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]] +>>> predicted_token_class +['O', + 'O', + 'B-location', + 'I-location', + 'B-group', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'B-location', + 'B-location', + 'O', + 'O'] +``` + + +Tokenize the text and return TensorFlow tensors: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_wnut_model") +>>> inputs = tokenizer(text, return_tensors="tf") +``` + +Pass your inputs to the model and return the `logits`: + +```py +>>> from transformers import TFAutoModelForTokenClassification + +>>> model = TFAutoModelForTokenClassification.from_pretrained("stevhliu/my_awesome_wnut_model") +>>> logits = model(**inputs).logits +``` + +Get the class with the highest probability, and use the model's `id2label` mapping to convert it to a text label: + +```py +>>> predicted_token_class_ids = tf.math.argmax(logits, axis=-1) +>>> predicted_token_class = [model.config.id2label[t] for t in predicted_token_class_ids[0].numpy().tolist()] +>>> predicted_token_class +['O', + 'O', + 'B-location', + 'I-location', + 'B-group', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'B-location', + 'B-location', + 'O', + 'O'] +``` + + diff --git a/docs/source/en/tasks/token_classification.mdx b/docs/source/en/tasks/token_classification.mdx deleted file mode 100644 index 8c7ceac48f46..000000000000 --- a/docs/source/en/tasks/token_classification.mdx +++ /dev/null @@ -1,552 +0,0 @@ - - -# Token classification - -[[open-in-colab]] - - - -Token classification assigns a label to individual tokens in a sentence. One of the most common token classification tasks is Named Entity Recognition (NER). NER attempts to find a label for each entity in a sentence, such as a person, location, or organization. - -This guide will show you how to: - -1. Finetune [DistilBERT](https://huggingface.co/distilbert-base-uncased) on the [WNUT 17](https://huggingface.co/datasets/wnut_17) dataset to detect new entities. -2. Use your finetuned model for inference. - - - -See the token classification [task page](https://huggingface.co/tasks/token-classification) for more information about other forms of token classification and their associated models, datasets, and metrics. - - - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install transformers datasets evaluate -``` - -We encourage you to login to your Hugging Face account so you can upload and share your model with the community. 
When prompted, enter your token to login: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load WNUT 17 dataset - -Start by loading the WNUT 17 dataset from the 🤗 Datasets library: - -```py ->>> from datasets import load_dataset - ->>> wnut = load_dataset("wnut_17") -``` - -Then take a look at an example: - -```py ->>> wnut["train"][0] -{'id': '0', - 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0], - 'tokens': ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.'] -} -``` - -Each number in `ner_tags` represents an entity. Convert the numbers to their label names to find out what the entities are: - -```py ->>> label_list = wnut["train"].features[f"ner_tags"].feature.names ->>> label_list -[ - "O", - "B-corporation", - "I-corporation", - "B-creative-work", - "I-creative-work", - "B-group", - "I-group", - "B-location", - "I-location", - "B-person", - "I-person", - "B-product", - "I-product", -] -``` - -The letter that prefixes each `ner_tag` indicates the token position of the entity: - -- `B-` indicates the beginning of an entity. -- `I-` indicates a token is contained inside the same entity (for example, the `State` token is a part of an entity like - `Empire State Building`). -- `0` indicates the token doesn't correspond to any entity. - -## Preprocess - - - -The next step is to load a DistilBERT tokenizer to preprocess the `tokens` field: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") -``` - -As you saw in the example `tokens` field above, it looks like the input has already been tokenized. But the input actually hasn't been tokenized yet and you'll need to set `is_split_into_words=True` to tokenize the words into subwords. For example: - -```py ->>> example = wnut["train"][0] ->>> tokenized_input = tokenizer(example["tokens"], is_split_into_words=True) ->>> tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"]) ->>> tokens -['[CLS]', '@', 'paul', '##walk', 'it', "'", 's', 'the', 'view', 'from', 'where', 'i', "'", 'm', 'living', 'for', 'two', 'weeks', '.', 'empire', 'state', 'building', '=', 'es', '##b', '.', 'pretty', 'bad', 'storm', 'here', 'last', 'evening', '.', '[SEP]'] -``` - -However, this adds some special tokens `[CLS]` and `[SEP]` and the subword tokenization creates a mismatch between the input and labels. A single word corresponding to a single label may now be split into two subwords. You'll need to realign the tokens and labels by: - -1. Mapping all tokens to their corresponding word with the [`word_ids`](https://huggingface.co/docs/tokenizers/python/latest/api/reference.html#tokenizers.Encoding.word_ids) method. -2. Assigning the label `-100` to the special tokens `[CLS]` and `[SEP]` so they're ignored by the PyTorch loss function. -3. Only labeling the first token of a given word. Assign `-100` to other subtokens from the same word. - -Here is how you can create a function to realign the tokens and labels, and truncate sequences to be no longer than DistilBERT's maximum input length: - -```py ->>> def tokenize_and_align_labels(examples): -... tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True) - -... labels = [] -... for i, label in enumerate(examples[f"ner_tags"]): -... 
word_ids = tokenized_inputs.word_ids(batch_index=i) # Map tokens to their respective word. -... previous_word_idx = None -... label_ids = [] -... for word_idx in word_ids: # Set the special tokens to -100. -... if word_idx is None: -... label_ids.append(-100) -... elif word_idx != previous_word_idx: # Only label the first token of a given word. -... label_ids.append(label[word_idx]) -... else: -... label_ids.append(-100) -... previous_word_idx = word_idx -... labels.append(label_ids) - -... tokenized_inputs["labels"] = labels -... return tokenized_inputs -``` - -To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once: - -```py ->>> tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True) -``` - -Now create a batch of examples using [`DataCollatorWithPadding`]. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximium length. - - - -```py ->>> from transformers import DataCollatorForTokenClassification - ->>> data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) -``` - - -```py ->>> from transformers import DataCollatorForTokenClassification - ->>> data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf") -``` - - - -## Evaluate - -Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [seqeval](https://huggingface.co/spaces/evaluate-metric/seqeval) framework (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric). Seqeval actually produces several scores: precision, recall, F1, and accuracy. - -```py ->>> import evaluate - ->>> seqeval = evaluate.load("seqeval") -``` - -Get the NER labels first, and then create a function that passes your true predictions and true labels to [`~evaluate.EvaluationModule.compute`] to calculate the scores: - -```py ->>> import numpy as np - ->>> labels = [label_list[i] for i in example[f"ner_tags"]] - - ->>> def compute_metrics(p): -... predictions, labels = p -... predictions = np.argmax(predictions, axis=2) - -... true_predictions = [ -... [label_list[p] for (p, l) in zip(prediction, label) if l != -100] -... for prediction, label in zip(predictions, labels) -... ] -... true_labels = [ -... [label_list[l] for (p, l) in zip(prediction, label) if l != -100] -... for prediction, label in zip(predictions, labels) -... ] - -... results = seqeval.compute(predictions=true_predictions, references=true_labels) -... return { -... "precision": results["overall_precision"], -... "recall": results["overall_recall"], -... "f1": results["overall_f1"], -... "accuracy": results["overall_accuracy"], -... } -``` - -Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. - -## Train - -Before you start training your model, create a map of the expected ids to their labels with `id2label` and `label2id`: - -```py ->>> id2label = { -... 0: "O", -... 1: "B-corporation", -... 2: "I-corporation", -... 3: "B-creative-work", -... 4: "I-creative-work", -... 5: "B-group", -... 6: "I-group", -... 7: "B-location", -... 8: "I-location", -... 
9: "B-person", -... 10: "I-person", -... 11: "B-product", -... 12: "I-product", -... } ->>> label2id = { -... "O": 0, -... "B-corporation": 1, -... "I-corporation": 2, -... "B-creative-work": 3, -... "I-creative-work": 4, -... "B-group": 5, -... "I-group": 6, -... "B-location": 7, -... "I-location": 8, -... "B-person": 9, -... "I-person": 10, -... "B-product": 11, -... "I-product": 12, -... } -``` - - - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! - - -You're ready to start training your model now! Load DistilBERT with [`AutoModelForTokenClassification`] along with the number of expected labels, and the label mappings: - -```py ->>> from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer - ->>> model = AutoModelForTokenClassification.from_pretrained( -... "distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id -... ) -``` - -At this point, only three steps remain: - -1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the seqeval scores and save the training checkpoint. -2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. -3. Call [`~Trainer.train`] to finetune your model. - -```py ->>> training_args = TrainingArguments( -... output_dir="my_awesome_wnut_model", -... learning_rate=2e-5, -... per_device_train_batch_size=16, -... per_device_eval_batch_size=16, -... num_train_epochs=2, -... weight_decay=0.01, -... evaluation_strategy="epoch", -... save_strategy="epoch", -... load_best_model_at_end=True, -... push_to_hub=True, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=tokenized_wnut["train"], -... eval_dataset=tokenized_wnut["test"], -... tokenizer=tokenizer, -... data_collator=data_collator, -... compute_metrics=compute_metrics, -... ) - ->>> trainer.train() -``` - -Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - - - - -If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! - - -To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: - -```py ->>> from transformers import create_optimizer - ->>> batch_size = 16 ->>> num_train_epochs = 3 ->>> num_train_steps = (len(tokenized_wnut["train"]) // batch_size) * num_train_epochs ->>> optimizer, lr_schedule = create_optimizer( -... init_lr=2e-5, -... num_train_steps=num_train_steps, -... weight_decay_rate=0.01, -... num_warmup_steps=0, -... ) -``` - -Then you can load DistilBERT with [`TFAutoModelForTokenClassification`] along with the number of expected labels, and the label mappings: - -```py ->>> from transformers import TFAutoModelForTokenClassification - ->>> model = TFAutoModelForTokenClassification.from_pretrained( -... "distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id -... 
) -``` - -Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: - -```py ->>> tf_train_set = model.prepare_tf_dataset( -... tokenized_wnut["train"], -... shuffle=True, -... batch_size=16, -... collate_fn=data_collator, -... ) - ->>> tf_validation_set = model.prepare_tf_dataset( -... tokenized_wnut["validation"], -... shuffle=False, -... batch_size=16, -... collate_fn=data_collator, -... ) -``` - -Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> import tensorflow as tf - ->>> model.compile(optimizer=optimizer) -``` - -The last two things to setup before you start training is to compute the seqeval scores from the predictions, and provide a way to push your model to the Hub. Both are done by using [Keras callbacks](./main_classes/keras_callbacks). - -Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: - -```py ->>> from transformers.keras_callbacks import KerasMetricCallback - ->>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) -``` - -Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: - -```py ->>> from transformers.keras_callbacks import PushToHubCallback - ->>> push_to_hub_callback = PushToHubCallback( -... output_dir="my_awesome_wnut_model", -... tokenizer=tokenizer, -... ) -``` - -Then bundle your callbacks together: - -```py ->>> callbacks = [metric_callback, push_to_hub_callback] -``` - -Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callbacks to finetune the model: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks) -``` - -Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! - - - - - -For a more in-depth example of how to finetune a model for token classification, take a look at the corresponding -[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb) -or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb). - - - -## Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Grab some text you'd like to run inference on: - -```py ->>> text = "The Golden State Warriors are an American professional basketball team based in San Francisco." -``` - -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. 
Instantiate a `pipeline` for NER with your model, and pass your text to it: - -```py ->>> from transformers import pipeline - ->>> classifier = pipeline("ner", model="stevhliu/my_awesome_wnut_model") ->>> classifier(text) -[{'entity': 'B-location', - 'score': 0.42658573, - 'index': 2, - 'word': 'golden', - 'start': 4, - 'end': 10}, - {'entity': 'I-location', - 'score': 0.35856336, - 'index': 3, - 'word': 'state', - 'start': 11, - 'end': 16}, - {'entity': 'B-group', - 'score': 0.3064001, - 'index': 4, - 'word': 'warriors', - 'start': 17, - 'end': 25}, - {'entity': 'B-location', - 'score': 0.65523505, - 'index': 13, - 'word': 'san', - 'start': 80, - 'end': 83}, - {'entity': 'B-location', - 'score': 0.4668663, - 'index': 14, - 'word': 'francisco', - 'start': 84, - 'end': 93}] -``` - -You can also manually replicate the results of the `pipeline` if you'd like: - - - -Tokenize the text and return PyTorch tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_wnut_model") ->>> inputs = tokenizer(text, return_tensors="pt") -``` - -Pass your inputs to the model and return the `logits`: - -```py ->>> from transformers import AutoModelForTokenClassification - ->>> model = AutoModelForTokenClassification.from_pretrained("stevhliu/my_awesome_wnut_model") ->>> with torch.no_grad(): -... logits = model(**inputs).logits -``` - -Get the class with the highest probability, and use the model's `id2label` mapping to convert it to a text label: - -```py ->>> predictions = torch.argmax(logits, dim=2) ->>> predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]] ->>> predicted_token_class -['O', - 'O', - 'B-location', - 'I-location', - 'B-group', - 'O', - 'O', - 'O', - 'O', - 'O', - 'O', - 'O', - 'O', - 'B-location', - 'B-location', - 'O', - 'O'] -``` - - -Tokenize the text and return TensorFlow tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_wnut_model") ->>> inputs = tokenizer(text, return_tensors="tf") -``` - -Pass your inputs to the model and return the `logits`: - -```py ->>> from transformers import TFAutoModelForTokenClassification - ->>> model = TFAutoModelForTokenClassification.from_pretrained("stevhliu/my_awesome_wnut_model") ->>> logits = model(**inputs).logits -``` - -Get the class with the highest probability, and use the model's `id2label` mapping to convert it to a text label: - -```py ->>> predicted_token_class_ids = tf.math.argmax(logits, axis=-1) ->>> predicted_token_class = [model.config.id2label[t] for t in predicted_token_class_ids[0].numpy().tolist()] ->>> predicted_token_class -['O', - 'O', - 'B-location', - 'I-location', - 'B-group', - 'O', - 'O', - 'O', - 'O', - 'O', - 'O', - 'O', - 'O', - 'B-location', - 'B-location', - 'O', - 'O'] -``` - - \ No newline at end of file diff --git a/docs/source/en/tasks/translation.md b/docs/source/en/tasks/translation.md new file mode 100644 index 000000000000..d5394caef838 --- /dev/null +++ b/docs/source/en/tasks/translation.md @@ -0,0 +1,411 @@ + + +# Translation + +[[open-in-colab]] + + + +Translation converts a sequence of text from one language to another. It is one of several tasks you can formulate as a sequence-to-sequence problem, a powerful framework for returning some output from an input, like translation or summarization. 
Translation systems are commonly used for translation between different language texts, but it can also be used for speech or some combination in between like text-to-speech or speech-to-text. + +This guide will show you how to: + +1. Finetune [T5](https://huggingface.co/t5-small) on the English-French subset of the [OPUS Books](https://huggingface.co/datasets/opus_books) dataset to translate English text to French. +2. Use your finetuned model for inference. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM-ProphetNet](../model_doc/xlm-prophetnet) + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate sacrebleu +``` + +We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load OPUS Books dataset + +Start by loading the English-French subset of the [OPUS Books](https://huggingface.co/datasets/opus_books) dataset from the 🤗 Datasets library: + +```py +>>> from datasets import load_dataset + +>>> books = load_dataset("opus_books", "en-fr") +``` + +Split the dataset into a train and test set with the [`~datasets.Dataset.train_test_split`] method: + +```py +>>> books = books["train"].train_test_split(test_size=0.2) +``` + +Then take a look at an example: + +```py +>>> books["train"][0] +{'id': '90560', + 'translation': {'en': 'But this lofty plateau measured only a few fathoms, and soon we reentered Our Element.', + 'fr': 'Mais ce plateau élevé ne mesurait que quelques toises, et bientôt nous fûmes rentrés dans notre élément.'}} +``` + +`translation`: an English and French translation of the text. + +## Preprocess + + + +The next step is to load a T5 tokenizer to process the English-French language pairs: + +```py +>>> from transformers import AutoTokenizer + +>>> checkpoint = "t5-small" +>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) +``` + +The preprocessing function you want to create needs to: + +1. Prefix the input with a prompt so T5 knows this is a translation task. Some models capable of multiple NLP tasks require prompting for specific tasks. +2. Tokenize the input (English) and target (French) separately because you can't tokenize French text with a tokenizer pretrained on an English vocabulary. +3. Truncate sequences to be no longer than the maximum length set by the `max_length` parameter. 
+ +```py +>>> source_lang = "en" +>>> target_lang = "fr" +>>> prefix = "translate English to French: " + + +>>> def preprocess_function(examples): +... inputs = [prefix + example[source_lang] for example in examples["translation"]] +... targets = [example[target_lang] for example in examples["translation"]] +... model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True) +... return model_inputs +``` + +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once: + +```py +>>> tokenized_books = books.map(preprocess_function, batched=True) +``` + +Now create a batch of examples using [`DataCollatorForSeq2Seq`]. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. + + + +```py +>>> from transformers import DataCollatorForSeq2Seq + +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint) +``` + + + +```py +>>> from transformers import DataCollatorForSeq2Seq + +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf") +``` + + + +## Evaluate + +Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [SacreBLEU](https://huggingface.co/spaces/evaluate-metric/sacrebleu) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): + +```py +>>> import evaluate + +>>> metric = evaluate.load("sacrebleu") +``` + +Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the SacreBLEU score: + +```py +>>> import numpy as np + + +>>> def postprocess_text(preds, labels): +... preds = [pred.strip() for pred in preds] +... labels = [[label.strip()] for label in labels] + +... return preds, labels + + +>>> def compute_metrics(eval_preds): +... preds, labels = eval_preds +... if isinstance(preds, tuple): +... preds = preds[0] +... decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) + +... labels = np.where(labels != -100, labels, tokenizer.pad_token_id) +... decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + +... decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) + +... result = metric.compute(predictions=decoded_preds, references=decoded_labels) +... result = {"bleu": result["score"]} + +... prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds] +... result["gen_len"] = np.mean(prediction_lens) +... result = {k: round(v, 4) for k, v in result.items()} +... return result +``` + +Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. + +## Train + + + + + +If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! + + + +You're ready to start training your model now! 
Load T5 with [`AutoModelForSeq2SeqLM`]: + +```py +>>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer + +>>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint) +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`Seq2SeqTrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the SacreBLEU metric and save the training checkpoint. +2. Pass the training arguments to [`Seq2SeqTrainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> training_args = Seq2SeqTrainingArguments( +... output_dir="my_awesome_opus_books_model", +... evaluation_strategy="epoch", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... weight_decay=0.01, +... save_total_limit=3, +... num_train_epochs=2, +... predict_with_generate=True, +... fp16=True, +... push_to_hub=True, +... ) + +>>> trainer = Seq2SeqTrainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_books["train"], +... eval_dataset=tokenized_books["test"], +... tokenizer=tokenizer, +... data_collator=data_collator, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +```` + +Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + +If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! + + +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +```py +>>> from transformers import AdamWeightDecay + +>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) +``` + +Then you can load T5 with [`TFAutoModelForSeq2SeqLM`]: + +```py +>>> from transformers import TFAutoModelForSeq2SeqLM + +>>> model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint) +``` + +Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_books["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_test_set = model.prepare_tf_dataset( +... tokenized_books["test"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +The last two things to setup before you start training is to compute the SacreBLEU metric from the predictions, and provide a way to push your model to the Hub. Both are done by using [Keras callbacks](../main_classes/keras_callbacks). 
+
+Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]:
+
+```py
+>>> from transformers.keras_callbacks import KerasMetricCallback
+
+>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set)
+```
+
+Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]:
+
+```py
+>>> from transformers.keras_callbacks import PushToHubCallback
+
+>>> push_to_hub_callback = PushToHubCallback(
+...     output_dir="my_awesome_opus_books_model",
+...     tokenizer=tokenizer,
+... )
+```
+
+Then bundle your callbacks together:
+
+```py
+>>> callbacks = [metric_callback, push_to_hub_callback]
+```
+
+Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callbacks to finetune the model:
+
+```py
+>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks)
+```
+
+Once training is completed, your model is automatically uploaded to the Hub so everyone can use it!
+
+
+
+
+
+For a more in-depth example of how to finetune a model for translation, take a look at the corresponding
+[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb)
+or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb).
+
+
+
+## Inference
+
+Great, now that you've finetuned a model, you can use it for inference!
+
+Come up with some text you'd like to translate to another language. For T5, you need to prefix your input depending on the task you're working on. For translation from English to French, you should prefix your input as shown below:
+
+```py
+>>> text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."
+```
+
+The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for translation with your model, and pass your text to it:
+
+```py
+>>> from transformers import pipeline
+
+>>> translator = pipeline("translation", model="my_awesome_opus_books_model")
+>>> translator(text)
+[{'translation_text': 'Legumes partagent des ressources avec des bactéries azotantes.'}]
+```
+
+You can also manually replicate the results of the `pipeline` if you'd like:
+
+
+
+Tokenize the text and return the `input_ids` as PyTorch tensors:
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_opus_books_model")
+>>> inputs = tokenizer(text, return_tensors="pt").input_ids
+```
+
+Use the [`~transformers.generation_utils.GenerationMixin.generate`] method to create the translation. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](../main_classes/text_generation) API.
+
+```py
+>>> from transformers import AutoModelForSeq2SeqLM
+
+>>> model = AutoModelForSeq2SeqLM.from_pretrained("my_awesome_opus_books_model")
+>>> outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
+```
+
+Decode the generated token ids back into text:
+
+```py
+>>> tokenizer.decode(outputs[0], skip_special_tokens=True)
+"Les lignées partagent des ressources avec des bactéries enfixant l'azote."
+``` + + +Tokenize the text and return the `input_ids` as TensorFlow tensors: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_opus_books_model") +>>> inputs = tokenizer(text, return_tensors="tf").input_ids +``` + +Use the [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] method to create the translation. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](../main_classes/text_generation) API. + +```py +>>> from transformers import TFAutoModelForSeq2SeqLM + +>>> model = TFAutoModelForSeq2SeqLM.from_pretrained("my_awesome_opus_books_model") +>>> outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95) +``` + +Decode the generated token ids back into text: + +```py +>>> tokenizer.decode(outputs[0], skip_special_tokens=True) +'Les lugumes partagent les ressources avec des bactéries fixatrices d'azote.' +``` + + diff --git a/docs/source/en/tasks/translation.mdx b/docs/source/en/tasks/translation.mdx deleted file mode 100644 index 318cb2b1a9d2..000000000000 --- a/docs/source/en/tasks/translation.mdx +++ /dev/null @@ -1,398 +0,0 @@ - - -# Translation - - - -Translation converts a sequence of text from one language to another. It is one of several tasks you can formulate as a sequence-to-sequence problem, a powerful framework for returning some output from an input, like translation or summarization. Translation systems are commonly used for translation between different language texts, but it can also be used for speech or some combination in between like text-to-speech or speech-to-text. - -This guide will show you how to: - -1. Finetune [T5](https://huggingface.co/t5-small) on the English-French subset of the [OPUS Books](https://huggingface.co/datasets/opus_books) dataset to translate English text to French. -2. Use your finetuned model for inference. - - - -See the translation [task page](https://huggingface.co/tasks/translation) for more information about its associated models, datasets, and metrics. - - - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install transformers datasets evaluate -``` - -We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load OPUS Books dataset - -Start by loading the English-French subset of the [OPUS Books](https://huggingface.co/datasets/opus_books) dataset from the 🤗 Datasets library: - -```py ->>> from datasets import load_dataset - ->>> books = load_dataset("opus_books", "en-fr") -``` - -Split the dataset into a train and test set with the [`~datasets.Dataset.train_test_split`] method: - -```py ->>> books = books["train"].train_test_split(test_size=0.2) -``` - -Then take a look at an example: - -```py ->>> books["train"][0] -{'id': '90560', - 'translation': {'en': 'But this lofty plateau measured only a few fathoms, and soon we reentered Our Element.', - 'fr': 'Mais ce plateau élevé ne mesurait que quelques toises, et bientôt nous fûmes rentrés dans notre élément.'}} -``` - -`translation`: an English and French translation of the text. 
- -## Preprocess - - - -The next step is to load a T5 tokenizer to process the English-French language pairs: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("t5-small") -``` - -The preprocessing function you want to create needs to: - -1. Prefix the input with a prompt so T5 knows this is a translation task. Some models capable of multiple NLP tasks require prompting for specific tasks. -2. Tokenize the input (English) and target (French) separately because you can't tokenize French text with a tokenizer pretrained on an English vocabulary. -3. Truncate sequences to be no longer than the maximum length set by the `max_length` parameter. - -```py ->>> source_lang = "en" ->>> target_lang = "fr" ->>> prefix = "translate English to French: " - - ->>> def preprocess_function(examples): -... inputs = [prefix + example[source_lang] for example in examples["translation"]] -... targets = [example[target_lang] for example in examples["translation"]] -... model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True) -... return model_inputs -``` - -To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once: - -```py ->>> tokenized_books = books.map(preprocess_function, batched=True) -``` - -Now create a batch of examples using [`DataCollatorForSeq2Seq`]. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximium length. - - - -```py ->>> from transformers import DataCollatorForSeq2Seq - ->>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model) -``` - - - -```py ->>> from transformers import DataCollatorForSeq2Seq - ->>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf") -``` - - - -## Evaluate - -Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [SacreBLEU](https://huggingface.co/spaces/evaluate-metric/sacrebleu) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): - -```py ->>> import evaluate - ->>> sacrebleu = evaluate.load("sacrebleu") -``` - -Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the SacreBLEU score: - -```py ->>> import numpy as np - - ->>> def postprocess_text(preds, labels): -... preds = [pred.strip() for pred in preds] -... labels = [[label.strip()] for label in labels] - -... return preds, labels - - ->>> def compute_metrics(eval_preds): -... preds, labels = eval_preds -... if isinstance(preds, tuple): -... preds = preds[0] -... decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) - -... labels = np.where(labels != -100, labels, tokenizer.pad_token_id) -... decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) - -... decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) - -... result = metric.compute(predictions=decoded_preds, references=decoded_labels) -... result = {"bleu": result["score"]} - -... 
prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds] -... result["gen_len"] = np.mean(prediction_lens) -... result = {k: round(v, 4) for k, v in result.items()} -... return result -``` - -Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. - -## Train - - - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! - - -You're ready to start training your model now! Load T5 with [`AutoModelForSeq2SeqLM`]: - -```py ->>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer - ->>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-small") -``` - -At this point, only three steps remain: - -1. Define your training hyperparameters in [`Seq2SeqTrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the SacreBLEU metric and save the training checkpoint. -2. Pass the training arguments to [`Seq2SeqTrainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. -3. Call [`~Trainer.train`] to finetune your model. - -```py ->>> training_args = Seq2SeqTrainingArguments( -... output_dir="my_awesome_opus_books_model", -... evaluation_strategy="epoch", -... learning_rate=2e-5, -... per_device_train_batch_size=16, -... per_device_eval_batch_size=16, -... weight_decay=0.01, -... save_total_limit=3, -... num_train_epochs=2, -... predict_with_generate=True, -... fp16=True, -... push_to_hub=True, -... ) - ->>> trainer = Seq2SeqTrainer( -... model=model, -... args=training_args, -... train_dataset=tokenized_books["train"], -... eval_dataset=tokenized_books["test"], -... tokenizer=tokenizer, -... data_collator=data_collator, -... compute_metrics=compute_metrics, -... ) - ->>> trainer.train() -```` - -Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - - - - -If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! - - -To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: - -```py ->>> from transformers import AdamWeightDecay - ->>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) -``` - -Then you can load T5 with [`TFAutoModelForSeq2SeqLM`]: - -```py ->>> from transformers import TFAutoModelForSeq2SeqLM - ->>> model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small") -``` - -Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: - -```py ->>> tf_train_set = model.prepare_tf_dataset( -... tokenized_books["train"], -... shuffle=True, -... batch_size=16, -... collate_fn=data_collator, -... ) - ->>> tf_test_set = model.prepare_tf_dataset( -... tokenized_books["test"], -... shuffle=False, -... batch_size=16, -... collate_fn=data_collator, -... 
) -``` - -Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> import tensorflow as tf - ->>> model.compile(optimizer=optimizer) -``` - -The last two things to setup before you start training is to compute the SacreBLEU metric from the predictions, and provide a way to push your model to the Hub. Both are done by using [Keras callbacks](./main_classes/keras_callbacks). - -Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: - -```py ->>> from transformers.keras_callbacks import KerasMetricCallback - ->>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) -``` - -Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: - -```py ->>> from transformers.keras_callbacks import PushToHubCallback - ->>> push_to_hub_callback = PushToHubCallback( -... output_dir="my_awesome_opus_books_model", -... tokenizer=tokenizer, -... ) -``` - -Then bundle your callbacks together: - -```py ->>> callbacks = [metric_callback, push_to_hub_callback] -``` - -Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callbacks to finetune the model: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks) -``` - -Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! - - - - - -For a more in-depth example of how to finetune a model for translation, take a look at the corresponding -[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb) -or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb). - - - -## Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Come up with some text you'd like to translate to another language. For T5, you need to prefix your input depending on the task you're working on. For translation from English to French, you should prefix your input as shown below: - -```py ->>> text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria." -``` - -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for translation with your model, and pass your text to it: - -```py ->>> from transformers import pipeline - ->>> translator = pipeline("translation", model="my_awesome_opus_books_model") ->>> translator(text) -[{'translation_text': 'Legumes partagent des ressources avec des bactéries azotantes.'}] -``` - -You can also manually replicate the results of the `pipeline` if you'd like: - - - -Tokenize the text and return the `input_ids` as PyTorch tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_opus_books_model") ->>> inputs = tokenizer(text, return_tensors="pt").input_ids -``` - -Use the [`~transformers.generation_utils.GenerationMixin.generate`] method to create the translation. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](./main_classes/text_generation) API. 
- -```py ->>> from transformers import AutoModelForSeq2SeqLM - ->>> model = AutoModelForSeq2SeqLM.from_pretrained("my_awesome_opus_books_model") ->>> outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95) -``` - -Decode the generated token ids back into text: - -```py ->>> tokenizer.decode(outputs[0], skip_special_tokens=True) -'Les lignées partagent des ressources avec des bactéries enfixant l'azote.' -``` - - -Tokenize the text and return the `input_ids` as TensorFlow tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_opus_books_model") ->>> inputs = tokenizer(text, return_tensors="tf").input_ids -``` - -Use the [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] method to create the translation. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](./main_classes/text_generation) API. - -```py ->>> from transformers import TFAutoModelForSeq2SeqLM - ->>> model = TFAutoModelForSeq2SeqLM.from_pretrained("my_awesome_opus_books_model") ->>> outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95) -``` - -Decode the generated token ids back into text: - -```py ->>> tokenizer.decode(outputs[0], skip_special_tokens=True) -'Les lugumes partagent les ressources avec des bactéries fixatrices d'azote.' -``` - - \ No newline at end of file diff --git a/docs/source/en/tasks/video_classification.md b/docs/source/en/tasks/video_classification.md new file mode 100644 index 000000000000..a140ba373099 --- /dev/null +++ b/docs/source/en/tasks/video_classification.md @@ -0,0 +1,496 @@ + + +# Video classification + +[[open-in-colab]] + +Video classification is the task of assigning a label or class to an entire video. Videos are expected to have only one class for each video. Video classification models take a video as input and return a prediction about which class the video belongs to. These models can be used to categorize what a video is all about. A real-world application of video classification is action / activity recognition, which is useful for fitness applications. It is also helpful for vision-impaired individuals, especially when they are commuting. + +This guide will show you how to: + +1. Fine-tune [VideoMAE](https://huggingface.co/docs/transformers/main/en/model_doc/videomae) on a subset of the [UCF101](https://www.crcv.ucf.edu/data/UCF101.php) dataset. +2. Use your fine-tuned model for inference. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[TimeSformer](../model_doc/timesformer), [VideoMAE](../model_doc/videomae), [ViViT](../model_doc/vivit) + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install -q pytorchvideo transformers evaluate +``` + +You will use [PyTorchVideo](https://pytorchvideo.org/) (dubbed `pytorchvideo`) to process and prepare the videos. + +We encourage you to log in to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to log in: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load UCF101 dataset + +Start by loading a subset of the [UCF-101 dataset](https://www.crcv.ucf.edu/data/UCF101.php). This will give you a chance to experiment and make sure everything works before spending more time training on the full dataset. 
+ +```py +>>> from huggingface_hub import hf_hub_download + +>>> hf_dataset_identifier = "sayakpaul/ucf101-subset" +>>> filename = "UCF101_subset.tar.gz" +>>> file_path = hf_hub_download(repo_id=hf_dataset_identifier, filename=filename, repo_type="dataset") +``` + +After the subset has been downloaded, you need to extract the compressed archive: + +```py +>>> import tarfile + +>>> with tarfile.open(file_path) as t: +... t.extractall(".") +``` + +At a high level, the dataset is organized like so: + +```bash +UCF101_subset/ + train/ + BandMarching/ + video_1.mp4 + video_2.mp4 + ... + Archery + video_1.mp4 + video_2.mp4 + ... + ... + val/ + BandMarching/ + video_1.mp4 + video_2.mp4 + ... + Archery + video_1.mp4 + video_2.mp4 + ... + ... + test/ + BandMarching/ + video_1.mp4 + video_2.mp4 + ... + Archery + video_1.mp4 + video_2.mp4 + ... + ... +``` + +The (`sorted`) video paths appear like so: + +```bash +... +'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g07_c04.avi', +'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g07_c06.avi', +'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c01.avi', +'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g09_c02.avi', +'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g09_c06.avi' +... +``` + +You will notice that there are video clips belonging to the same group / scene where group is denoted by `g` in the video file paths. `v_ApplyEyeMakeup_g07_c04.avi` and `v_ApplyEyeMakeup_g07_c06.avi`, for example. + +For the validation and evaluation splits, you wouldn't want to have video clips from the same group / scene to prevent [data leakage](https://www.kaggle.com/code/alexisbcook/data-leakage). The subset that you are using in this tutorial takes this information into account. + +Next up, you will derive the set of labels present in the dataset. Also, create two dictionaries that'll be helpful when initializing the model: + +* `label2id`: maps the class names to integers. +* `id2label`: maps the integers to class names. + +```py +>>> class_labels = sorted({str(path).split("/")[2] for path in all_video_file_paths}) +>>> label2id = {label: i for i, label in enumerate(class_labels)} +>>> id2label = {i: label for label, i in label2id.items()} + +>>> print(f"Unique classes: {list(label2id.keys())}.") + +# Unique classes: ['ApplyEyeMakeup', 'ApplyLipstick', 'Archery', 'BabyCrawling', 'BalanceBeam', 'BandMarching', 'BaseballPitch', 'Basketball', 'BasketballDunk', 'BenchPress']. +``` + +There are 10 unique classes. For each class, there are 30 videos in the training set. + +## Load a model to fine-tune + +Instantiate a video classification model from a pretrained checkpoint and its associated image processor. The model's encoder comes with pre-trained parameters, and the classification head is randomly initialized. The image processor will come in handy when writing the preprocessing pipeline for our dataset. + +```py +>>> from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification + +>>> model_ckpt = "MCG-NJU/videomae-base" +>>> image_processor = VideoMAEImageProcessor.from_pretrained(model_ckpt) +>>> model = VideoMAEForVideoClassification.from_pretrained( +... model_ckpt, +... label2id=label2id, +... id2label=id2label, +... ignore_mismatched_sizes=True, # provide this in case you're planning to fine-tune an already fine-tuned checkpoint +... 
) +``` + +While the model is loading, you might notice the following warning: + +```bash +Some weights of the model checkpoint at MCG-NJU/videomae-base were not used when initializing VideoMAEForVideoClassification: [..., 'decoder.decoder_layers.1.attention.output.dense.bias', 'decoder.decoder_layers.2.attention.attention.key.weight'] +- This IS expected if you are initializing VideoMAEForVideoClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). +- This IS NOT expected if you are initializing VideoMAEForVideoClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). +Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight'] +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. +``` + +The warning is telling us we are throwing away some weights (e.g. the weights and bias of the `classifier` layer) and randomly initializing some others (the weights and bias of a new `classifier` layer). This is expected in this case, because we are adding a new head for which we don't have pretrained weights, so the library warns us we should fine-tune this model before using it for inference, which is exactly what we are going to do. + +**Note** that [this checkpoint](https://huggingface.co/MCG-NJU/videomae-base-finetuned-kinetics) leads to better performance on this task as the checkpoint was obtained fine-tuning on a similar downstream task having considerable domain overlap. You can check out [this checkpoint](https://huggingface.co/sayakpaul/videomae-base-finetuned-kinetics-finetuned-ucf101-subset) which was obtained by fine-tuning `MCG-NJU/videomae-base-finetuned-kinetics`. + +## Prepare the datasets for training + +For preprocessing the videos, you will leverage the [PyTorchVideo library](https://pytorchvideo.org/). Start by importing the dependencies we need. + +```py +>>> import pytorchvideo.data + +>>> from pytorchvideo.transforms import ( +... ApplyTransformToKey, +... Normalize, +... RandomShortSideScale, +... RemoveKey, +... ShortSideScale, +... UniformTemporalSubsample, +... ) + +>>> from torchvision.transforms import ( +... Compose, +... Lambda, +... RandomCrop, +... RandomHorizontalFlip, +... Resize, +... ) +``` + +For the training dataset transformations, use a combination of uniform temporal subsampling, pixel normalization, random cropping, and random horizontal flipping. For the validation and evaluation dataset transformations, keep the same transformation chain except for random cropping and horizontal flipping. To learn more about the details of these transformations check out the [official documentation of PyTorchVideo](https://pytorchvideo.org). + +Use the `image_processor` associated with the pre-trained model to obtain the following information: + +* Image mean and standard deviation with which the video frame pixels will be normalized. +* Spatial resolution to which the video frames will be resized. + +Start by defining some constants. + +```py +>>> mean = image_processor.image_mean +>>> std = image_processor.image_std +>>> if "shortest_edge" in image_processor.size: +... height = width = image_processor.size["shortest_edge"] +>>> else: +... 
height = image_processor.size["height"] +... width = image_processor.size["width"] +>>> resize_to = (height, width) + +>>> num_frames_to_sample = model.config.num_frames +>>> sample_rate = 4 +>>> fps = 30 +>>> clip_duration = num_frames_to_sample * sample_rate / fps +``` + +Now, define the dataset-specific transformations and the datasets respectively. Starting with the training set: + +```py +>>> train_transform = Compose( +... [ +... ApplyTransformToKey( +... key="video", +... transform=Compose( +... [ +... UniformTemporalSubsample(num_frames_to_sample), +... Lambda(lambda x: x / 255.0), +... Normalize(mean, std), +... RandomShortSideScale(min_size=256, max_size=320), +... RandomCrop(resize_to), +... RandomHorizontalFlip(p=0.5), +... ] +... ), +... ), +... ] +... ) + +>>> train_dataset = pytorchvideo.data.Ucf101( +... data_path=os.path.join(dataset_root_path, "train"), +... clip_sampler=pytorchvideo.data.make_clip_sampler("random", clip_duration), +... decode_audio=False, +... transform=train_transform, +... ) +``` + +The same sequence of workflow can be applied to the validation and evaluation sets: + +```py +>>> val_transform = Compose( +... [ +... ApplyTransformToKey( +... key="video", +... transform=Compose( +... [ +... UniformTemporalSubsample(num_frames_to_sample), +... Lambda(lambda x: x / 255.0), +... Normalize(mean, std), +... Resize(resize_to), +... ] +... ), +... ), +... ] +... ) + +>>> val_dataset = pytorchvideo.data.Ucf101( +... data_path=os.path.join(dataset_root_path, "val"), +... clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", clip_duration), +... decode_audio=False, +... transform=val_transform, +... ) + +>>> test_dataset = pytorchvideo.data.Ucf101( +... data_path=os.path.join(dataset_root_path, "test"), +... clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", clip_duration), +... decode_audio=False, +... transform=val_transform, +... ) +``` + +**Note**: The above dataset pipelines are taken from the [official PyTorchVideo example](https://pytorchvideo.org/docs/tutorial_classification#dataset). We're using the [`pytorchvideo.data.Ucf101()`](https://pytorchvideo.readthedocs.io/en/latest/api/data/data.html#pytorchvideo.data.Ucf101) function because it's tailored for the UCF-101 dataset. Under the hood, it returns a [`pytorchvideo.data.labeled_video_dataset.LabeledVideoDataset`](https://pytorchvideo.readthedocs.io/en/latest/api/data/data.html#pytorchvideo.data.LabeledVideoDataset) object. `LabeledVideoDataset` class is the base class for all things video in the PyTorchVideo dataset. So, if you want to use a custom dataset not supported off-the-shelf by PyTorchVideo, you can extend the `LabeledVideoDataset` class accordingly. Refer to the `data` API [documentation to](https://pytorchvideo.readthedocs.io/en/latest/api/data/data.html) learn more. Also, if your dataset follows a similar structure (as shown above), then using the `pytorchvideo.data.Ucf101()` should work just fine. + +You can access the `num_videos` argument to know the number of videos in the dataset. + +```py +>>> print(train_dataset.num_videos, val_dataset.num_videos, test_dataset.num_videos) +# (300, 30, 75) +``` + +## Visualize the preprocessed video for better debugging + +```py +>>> import imageio +>>> import numpy as np +>>> from IPython.display import Image + +>>> def unnormalize_img(img): +... """Un-normalizes the image pixels.""" +... img = (img * std) + mean +... img = (img * 255).astype("uint8") +... 
return img.clip(0, 255) + +>>> def create_gif(video_tensor, filename="sample.gif"): +... """Prepares a GIF from a video tensor. +... +... The video tensor is expected to have the following shape: +... (num_frames, num_channels, height, width). +... """ +... frames = [] +... for video_frame in video_tensor: +... frame_unnormalized = unnormalize_img(video_frame.permute(1, 2, 0).numpy()) +... frames.append(frame_unnormalized) +... kargs = {"duration": 0.25} +... imageio.mimsave(filename, frames, "GIF", **kargs) +... return filename + +>>> def display_gif(video_tensor, gif_name="sample.gif"): +... """Prepares and displays a GIF from a video tensor.""" +... video_tensor = video_tensor.permute(1, 0, 2, 3) +... gif_filename = create_gif(video_tensor, gif_name) +... return Image(filename=gif_filename) + +>>> sample_video = next(iter(train_dataset)) +>>> video_tensor = sample_video["video"] +>>> display_gif(video_tensor) +``` + +
+ Person playing basketball +
+ +## Train the model + +Leverage [`Trainer`](https://huggingface.co/docs/transformers/main_classes/trainer) from 🤗 Transformers for training the model. To instantiate a `Trainer`, you need to define the training configuration and an evaluation metric. The most important is the [`TrainingArguments`](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments), which is a class that contains all the attributes to configure the training. It requires an output folder name, which will be used to save the checkpoints of the model. It also helps sync all the information in the model repository on 🤗 Hub. + +Most of the training arguments are self-explanatory, but one that is quite important here is `remove_unused_columns=False`. This one will drop any features not used by the model's call function. By default it's `True` because usually it's ideal to drop unused feature columns, making it easier to unpack inputs into the model's call function. But, in this case, you need the unused features ('video' in particular) in order to create `pixel_values` (which is a mandatory key our model expects in its inputs). + + +```py +>>> from transformers import TrainingArguments, Trainer + +>>> model_name = model_ckpt.split("/")[-1] +>>> new_model_name = f"{model_name}-finetuned-ucf101-subset" +>>> num_epochs = 4 + +>>> args = TrainingArguments( +... new_model_name, +... remove_unused_columns=False, +... evaluation_strategy="epoch", +... save_strategy="epoch", +... learning_rate=5e-5, +... per_device_train_batch_size=batch_size, +... per_device_eval_batch_size=batch_size, +... warmup_ratio=0.1, +... logging_steps=10, +... load_best_model_at_end=True, +... metric_for_best_model="accuracy", +... push_to_hub=True, +... max_steps=(train_dataset.num_videos // batch_size) * num_epochs, +... ) +``` + +The dataset returned by `pytorchvideo.data.Ucf101()` doesn't implement the `__len__` method. As such, we must define `max_steps` when instantiating `TrainingArguments`. + +Next, you need to define a function to compute the metrics from the predictions, which will use the `metric` you'll load now. The only preprocessing you have to do is to take the argmax of our predicted logits: + +```py +import evaluate + +metric = evaluate.load("accuracy") + + +def compute_metrics(eval_pred): + predictions = np.argmax(eval_pred.predictions, axis=1) + return metric.compute(predictions=predictions, references=eval_pred.label_ids) +``` + +**A note on evaluation**: + +In the [VideoMAE paper](https://arxiv.org/abs/2203.12602), the authors use the following evaluation strategy. They evaluate the model on several clips from test videos and apply different crops to those clips and report the aggregate score. However, in the interest of simplicity and brevity, we don't consider that in this tutorial. + +Also, define a `collate_fn`, which will be used to batch examples together. Each batch consists of 2 keys, namely `pixel_values` and `labels`. + +```py +>>> def collate_fn(examples): +... # permute to (num_frames, num_channels, height, width) +... pixel_values = torch.stack( +... [example["video"].permute(1, 0, 2, 3) for example in examples] +... ) +... labels = torch.tensor([example["label"] for example in examples]) +... return {"pixel_values": pixel_values, "labels": labels} +``` + +Then you just pass all of this along with the datasets to `Trainer`: + +```py +>>> trainer = Trainer( +... model, +... args, +... train_dataset=train_dataset, +... eval_dataset=val_dataset, +... tokenizer=image_processor, +... 
compute_metrics=compute_metrics, +... data_collator=collate_fn, +... ) +``` + +You might wonder why you passed along the `image_processor` as a tokenizer when you preprocessed the data already. This is only to make sure the image processor configuration file (stored as JSON) will also be uploaded to the repo on the Hub. + +Now fine-tune our model by calling the `train` method: + +```py +>>> train_results = trainer.train() +``` + +Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + +## Inference + +Great, now that you have fine-tuned a model, you can use it for inference! + +Load a video for inference: + +```py +>>> sample_test_video = next(iter(test_dataset)) +``` + +
+ Teams playing basketball +
+ +The simplest way to try out your fine-tuned model for inference is to use it in a [`pipeline`](https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#transformers.VideoClassificationPipeline). Instantiate a `pipeline` for video classification with your model, and pass your video to it: + +```py +>>> from transformers import pipeline + +>>> video_cls = pipeline(model="my_awesome_video_cls_model") +>>> video_cls("https://huggingface.co/datasets/sayakpaul/ucf101-subset/resolve/main/v_BasketballDunk_g14_c06.avi") +[{'score': 0.9272987842559814, 'label': 'BasketballDunk'}, + {'score': 0.017777055501937866, 'label': 'BabyCrawling'}, + {'score': 0.01663011871278286, 'label': 'BalanceBeam'}, + {'score': 0.009560945443809032, 'label': 'BandMarching'}, + {'score': 0.0068979403004050255, 'label': 'BaseballPitch'}] +``` + +You can also manually replicate the results of the `pipeline` if you'd like. + + +```py +>>> def run_inference(model, video): +... # (num_frames, num_channels, height, width) +... perumuted_sample_test_video = video.permute(1, 0, 2, 3) +... inputs = { +... "pixel_values": perumuted_sample_test_video.unsqueeze(0), +... "labels": torch.tensor( +... [sample_test_video["label"]] +... ), # this can be skipped if you don't have labels available. +... } + +... device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +... inputs = {k: v.to(device) for k, v in inputs.items()} +... model = model.to(device) + +... # forward pass +... with torch.no_grad(): +... outputs = model(**inputs) +... logits = outputs.logits + +... return logits +``` + +Now, pass your input to the model and return the `logits`: + +``` +>>> logits = run_inference(trained_model, sample_test_video["video"]) +``` + +Decoding the `logits`, we get: + +```py +>>> predicted_class_idx = logits.argmax(-1).item() +>>> print("Predicted class:", model.config.id2label[predicted_class_idx]) +# Predicted class: BasketballDunk +``` \ No newline at end of file diff --git a/docs/source/en/tasks/video_classification.mdx b/docs/source/en/tasks/video_classification.mdx deleted file mode 100644 index 948d4c09a5dc..000000000000 --- a/docs/source/en/tasks/video_classification.mdx +++ /dev/null @@ -1,487 +0,0 @@ - - -# Video classification - -[[open-in-colab]] - -Video classification is the task of assigning a label or class to an entire video. Videos are expected to have only one class for each video. Video classification models take a video as input and return a prediction about which class the video belongs to. These models can be used to categorize what a video is all about. A real-world application of video classification is action / activity recognition, which is useful for fitness applications. It is also helpful for vision-impaired individuals, especially when they are commuting. - -This guide will show you how to: - -1. Fine-tune [VideoMAE](https://huggingface.co/docs/transformers/main/en/model_doc/videomae) on a subset of the [UCF101](https://www.crcv.ucf.edu/data/UCF101.php) dataset. -2. Use your fine-tuned model for inference. - - - -See the video classification [task page](https://huggingface.co/tasks/video-classification) for more information about its associated models, datasets, and metrics. - - - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install -q pytorchvideo transformers evaluate -``` - -You will use [PyTorchVideo](https://pytorchvideo.org/) (dubbed `pytorchvideo`) to process and prepare the videos. 
- -We encourage you to log in to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to log in: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load UCF101 dataset - -Start by loading a subset of the [UCF-101 dataset](https://www.crcv.ucf.edu/data/UCF101.php). This will give you a chance to experiment and make sure everything works before spending more time training on the full dataset. - -```py ->>> from huggingface_hub import hf_hub_download - ->>> hf_dataset_identifier = "sayakpaul/ucf101-subset" ->>> filename = "UCF101_subset.tar.gz" ->>> file_path = hf_hub_download(repo_id=hf_dataset_identifier, filename=filename, repo_type="dataset") -``` - -After the subset has been downloaded, you need to extract the compressed archive: - -```py ->>> import tarfile - ->>> with tarfile.open(file_path) as t: -... t.extractall(".") -``` - -At a high level, the dataset is organized like so: - -```bash -UCF101_subset/ - train/ - BandMarching/ - video_1.mp4 - video_2.mp4 - ... - Archery - video_1.mp4 - video_2.mp4 - ... - ... - val/ - BandMarching/ - video_1.mp4 - video_2.mp4 - ... - Archery - video_1.mp4 - video_2.mp4 - ... - ... - test/ - BandMarching/ - video_1.mp4 - video_2.mp4 - ... - Archery - video_1.mp4 - video_2.mp4 - ... - ... -``` - -The (`sorted`) video paths appear like so: - -```bash -... -'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g07_c04.avi', -'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g07_c06.avi', -'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c01.avi', -'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g09_c02.avi', -'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g09_c06.avi' -... -``` - -You will notice that there are video clips belonging to the same group / scene where group is denoted by `g` in the video file paths. `v_ApplyEyeMakeup_g07_c04.avi` and `v_ApplyEyeMakeup_g07_c06.avi`, for example. - -For the validation and evaluation splits, you wouldn't want to have video clips from the same group / scene to prevent [data leakage](https://www.kaggle.com/code/alexisbcook/data-leakage). The subset that you are using in this tutorial takes this information into account. - -Next up, you will derive the set of labels present in the dataset. Also, create two dictionaries that'll be helpful when initializing the model: - -* `label2id`: maps the class names to integers. -* `id2label`: maps the integers to class names. - -```py ->>> class_labels = sorted({str(path).split("/")[2] for path in all_video_file_paths}) ->>> label2id = {label: i for i, label in enumerate(class_labels)} ->>> id2label = {i: label for label, i in label2id.items()} - ->>> print(f"Unique classes: {list(label2id.keys())}.") - -# Unique classes: ['ApplyEyeMakeup', 'ApplyLipstick', 'Archery', 'BabyCrawling', 'BalanceBeam', 'BandMarching', 'BaseballPitch', 'Basketball', 'BasketballDunk', 'BenchPress']. -``` - -There are 10 unique classes. For each class, there are 30 videos in the training set. - -## Load a model to fine-tune - -Instantiate a video classification model from a pretrained checkpoint and its associated image processor. The model's encoder comes with pre-trained parameters, and the classification head is randomly initialized. The image processor will come in handy when writing the preprocessing pipeline for our dataset. 
- -```py ->>> from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification - ->>> model_ckpt = "MCG-NJU/videomae-base" ->>> image_processor = VideoMAEImageProcessor.from_pretrained(model_ckpt) ->>> model = VideoMAEForVideoClassification.from_pretrained( -... model_ckpt, -... label2id=label2id, -... id2label=id2label, -... ignore_mismatched_sizes=True, # provide this in case you're planning to fine-tune an already fine-tuned checkpoint -... ) -``` - -While the model is loading, you might notice the following warning: - -```bash -Some weights of the model checkpoint at MCG-NJU/videomae-base were not used when initializing VideoMAEForVideoClassification: [..., 'decoder.decoder_layers.1.attention.output.dense.bias', 'decoder.decoder_layers.2.attention.attention.key.weight'] -- This IS expected if you are initializing VideoMAEForVideoClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -- This IS NOT expected if you are initializing VideoMAEForVideoClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight'] -You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. -``` - -The warning is telling us we are throwing away some weights (e.g. the weights and bias of the `classifier` layer) and randomly initializing some others (the weights and bias of a new `classifier` layer). This is expected in this case, because we are adding a new head for which we don't have pretrained weights, so the library warns us we should fine-tune this model before using it for inference, which is exactly what we are going to do. - -**Note** that [this checkpoint](https://huggingface.co/MCG-NJU/videomae-base-finetuned-kinetics) leads to better performance on this task as the checkpoint was obtained fine-tuning on a similar downstream task having considerable domain overlap. You can check out [this checkpoint](https://huggingface.co/sayakpaul/videomae-base-finetuned-kinetics-finetuned-ucf101-subset) which was obtained by fine-tuning `MCG-NJU/videomae-base-finetuned-kinetics`. - -## Prepare the datasets for training - -For preprocessing the videos, you will leverage the [PyTorchVideo library](https://pytorchvideo.org/). Start by importing the dependencies we need. - -```py ->>> import pytorchvideo.data - ->>> from pytorchvideo.transforms import ( -... ApplyTransformToKey, -... Normalize, -... RandomShortSideScale, -... RemoveKey, -... ShortSideScale, -... UniformTemporalSubsample, -... ) - ->>> from torchvision.transforms import ( -... Compose, -... Lambda, -... RandomCrop, -... RandomHorizontalFlip, -... Resize, -... ) -``` - -For the training dataset transformations, use a combination of uniform temporal subsampling, pixel normalization, random cropping, and random horizontal flipping. For the validation and evaluation dataset transformations, keep the same transformation chain except for random cropping and horizontal flipping. To learn more about the details of these transformations check out the [official documentation of PyTorchVideo](https://pytorchvideo.org). 
- -Use the `image_processor` associated with the pre-trained model to obtain the following information: - -* Image mean and standard deviation with which the video frame pixels will be normalized. -* Spatial resolution to which the video frames will be resized. - -Start by defining some constants. - -```py ->>> mean = image_processor.image_mean ->>> std = image_processor.image_std ->>> if "shortest_edge" in image_processor.size: -... height = width = image_processor.size["shortest_edge"] ->>> else: -... height = image_processor.size["height"] -... width = image_processor.size["width"] ->>> resize_to = (height, width) - ->>> num_frames_to_sample = model.config.num_frames ->>> sample_rate = 4 ->>> fps = 30 ->>> clip_duration = num_frames_to_sample * sample_rate / fps -``` - -Now, define the dataset-specific transformations and the datasets respectively. Starting with the training set: - -```py ->>> train_transform = Compose( -... [ -... ApplyTransformToKey( -... key="video", -... transform=Compose( -... [ -... UniformTemporalSubsample(num_frames_to_sample), -... Lambda(lambda x: x / 255.0), -... Normalize(mean, std), -... RandomShortSideScale(min_size=256, max_size=320), -... RandomCrop(resize_to), -... RandomHorizontalFlip(p=0.5), -... ] -... ), -... ), -... ] -... ) - ->>> train_dataset = pytorchvideo.data.Ucf101( -... data_path=os.path.join(dataset_root_path, "train"), -... clip_sampler=pytorchvideo.data.make_clip_sampler("random", clip_duration), -... decode_audio=False, -... transform=train_transform, -... ) -``` - -The same sequence of workflow can be applied to the validation and evaluation sets: - -```py ->>> val_transform = Compose( -... [ -... ApplyTransformToKey( -... key="video", -... transform=Compose( -... [ -... UniformTemporalSubsample(num_frames_to_sample), -... Lambda(lambda x: x / 255.0), -... Normalize(mean, std), -... Resize(resize_to), -... ] -... ), -... ), -... ] -... ) - ->>> val_dataset = pytorchvideo.data.Ucf101( -... data_path=os.path.join(dataset_root_path, "val"), -... clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", clip_duration), -... decode_audio=False, -... transform=val_transform, -... ) - ->>> test_dataset = pytorchvideo.data.Ucf101( -... data_path=os.path.join(dataset_root_path, "test"), -... clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", clip_duration), -... decode_audio=False, -... transform=val_transform, -... ) -``` - -**Note**: The above dataset pipelines are taken from the [official PyTorchVideo example](https://pytorchvideo.org/docs/tutorial_classification#dataset). We're using the [`pytorchvideo.data.Ucf101()`](https://pytorchvideo.readthedocs.io/en/latest/api/data/data.html#pytorchvideo.data.Ucf101) function because it's tailored for the UCF-101 dataset. Under the hood, it returns a [`pytorchvideo.data.labeled_video_dataset.LabeledVideoDataset`](https://pytorchvideo.readthedocs.io/en/latest/api/data/data.html#pytorchvideo.data.LabeledVideoDataset) object. `LabeledVideoDataset` class is the base class for all things video in the PyTorchVideo dataset. So, if you want to use a custom dataset not supported off-the-shelf by PyTorchVideo, you can extend the `LabeledVideoDataset` class accordingly. Refer to the `data` API [documentation to](https://pytorchvideo.readthedocs.io/en/latest/api/data/data.html) learn more. Also, if your dataset follows a similar structure (as shown above), then using the `pytorchvideo.data.Ucf101()` should work just fine. 
- -You can access the `num_videos` argument to know the number of videos in the dataset. - -```py ->>> print(train_dataset.num_videos, val_dataset.num_videos, test_dataset.num_videos) -# (300, 30, 75) -``` - -## Visualize the preprocessed video for better debugging - -```py ->>> import imageio ->>> import numpy as np ->>> from IPython.display import Image - ->>> def unnormalize_img(img): -... """Un-normalizes the image pixels.""" -... img = (img * std) + mean -... img = (img * 255).astype("uint8") -... return img.clip(0, 255) - ->>> def create_gif(video_tensor, filename="sample.gif"): -... """Prepares a GIF from a video tensor. -... -... The video tensor is expected to have the following shape: -... (num_frames, num_channels, height, width). -... """ -... frames = [] -... for video_frame in video_tensor: -... frame_unnormalized = unnormalize_img(video_frame.permute(1, 2, 0).numpy()) -... frames.append(frame_unnormalized) -... kargs = {"duration": 0.25} -... imageio.mimsave(filename, frames, "GIF", **kargs) -... return filename - ->>> def display_gif(video_tensor, gif_name="sample.gif"): -... """Prepares and displays a GIF from a video tensor.""" -... video_tensor = video_tensor.permute(1, 0, 2, 3) -... gif_filename = create_gif(video_tensor, gif_name) -... return Image(filename=gif_filename) - ->>> sample_video = next(iter(train_dataset)) ->>> video_tensor = sample_video["video"] ->>> display_gif(video_tensor) -``` - -
- Person playing basketball -
- -## Train the model - -Leverage [`Trainer`](https://huggingface.co/docs/transformers/main_classes/trainer) from 🤗 Transformers for training the model. To instantiate a `Trainer`, you need to define the training configuration and an evaluation metric. The most important is the [`TrainingArguments`](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments), which is a class that contains all the attributes to configure the training. It requires an output folder name, which will be used to save the checkpoints of the model. It also helps sync all the information in the model repository on 🤗 Hub. - -Most of the training arguments are self-explanatory, but one that is quite important here is `remove_unused_columns=False`. This one will drop any features not used by the model's call function. By default it's `True` because usually it's ideal to drop unused feature columns, making it easier to unpack inputs into the model's call function. But, in this case, you need the unused features ('video' in particular) in order to create `pixel_values` (which is a mandatory key our model expects in its inputs). - - -```py ->>> from transformers import TrainingArguments, Trainer - ->>> model_name = model_ckpt.split("/")[-1] ->>> new_model_name = f"{model_name}-finetuned-ucf101-subset" ->>> num_epochs = 4 - ->>> args = TrainingArguments( -... new_model_name, -... remove_unused_columns=False, -... evaluation_strategy="epoch", -... save_strategy="epoch", -... learning_rate=5e-5, -... per_device_train_batch_size=batch_size, -... per_device_eval_batch_size=batch_size, -... warmup_ratio=0.1, -... logging_steps=10, -... load_best_model_at_end=True, -... metric_for_best_model="accuracy", -... push_to_hub=True, -... max_steps=(train_dataset.num_videos // batch_size) * num_epochs, -... ) -``` - -The dataset returned by `pytorchvideo.data.Ucf101()` doesn't implement the `__len__` method. As such, we must define `max_steps` when instantiating `TrainingArguments`. - -Next, you need to define a function to compute the metrics from the predictions, which will use the `metric` you'll load now. The only preprocessing you have to do is to take the argmax of our predicted logits: - -```py -import evaluate - -metric = evaluate.load("accuracy") - - -def compute_metrics(eval_pred): - predictions = np.argmax(eval_pred.predictions, axis=1) - return metric.compute(predictions=predictions, references=eval_pred.label_ids) -``` - -**A note on evaluation**: - -In the [VideoMAE paper](https://arxiv.org/abs/2203.12602), the authors use the following evaluation strategy. They evaluate the model on several clips from test videos and apply different crops to those clips and report the aggregate score. However, in the interest of simplicity and brevity, we don't consider that in this tutorial. - -Also, define a `collate_fn`, which will be used to batch examples together. Each batch consists of 2 keys, namely `pixel_values` and `labels`. - -```py ->>> def collate_fn(examples): -... # permute to (num_frames, num_channels, height, width) -... pixel_values = torch.stack( -... [example["video"].permute(1, 0, 2, 3) for example in examples] -... ) -... labels = torch.tensor([example["label"] for example in examples]) -... return {"pixel_values": pixel_values, "labels": labels} -``` - -Then you just pass all of this along with the datasets to `Trainer`: - -```py ->>> trainer = Trainer( -... model, -... args, -... train_dataset=train_dataset, -... eval_dataset=val_dataset, -... tokenizer=image_processor, -... 
compute_metrics=compute_metrics, -... data_collator=collate_fn, -... ) -``` - -You might wonder why you passed along the `image_processor` as a tokenizer when you preprocessed the data already. This is only to make sure the image processor configuration file (stored as JSON) will also be uploaded to the repo on the Hub. - -Now fine-tune our model by calling the `train` method: - -```py ->>> train_results = trainer.train() -``` - -Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - -## Inference - -Great, now that you have fine-tuned a model, you can use it for inference! - -Load a video for inference: - -```py ->>> sample_test_video = next(iter(test_dataset)) -``` - -
- Teams playing basketball -
- -The simplest way to try out your fine-tuned model for inference is to use it in a [`pipeline`](https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#transformers.VideoClassificationPipeline). Instantiate a `pipeline` for video classification with your model, and pass your video to it: - -```py ->>> from transformers import pipeline - ->>> video_cls = pipeline(model="my_awesome_video_cls_model") ->>> video_cls("https://huggingface.co/datasets/sayakpaul/ucf101-subset/resolve/main/v_BasketballDunk_g14_c06.avi") -[{'score': 0.9272987842559814, 'label': 'BasketballDunk'}, - {'score': 0.017777055501937866, 'label': 'BabyCrawling'}, - {'score': 0.01663011871278286, 'label': 'BalanceBeam'}, - {'score': 0.009560945443809032, 'label': 'BandMarching'}, - {'score': 0.0068979403004050255, 'label': 'BaseballPitch'}] -``` - -You can also manually replicate the results of the `pipeline` if you'd like. - - -```py ->>> def run_inference(model, video): -... # (num_frames, num_channels, height, width) -... perumuted_sample_test_video = video.permute(1, 0, 2, 3) -... inputs = { -... "pixel_values": perumuted_sample_test_video.unsqueeze(0), -... "labels": torch.tensor( -... [sample_test_video["label"]] -... ), # this can be skipped if you don't have labels available. -... } - -... device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -... inputs = {k: v.to(device) for k, v in inputs.items()} -... model = model.to(device) - -... # forward pass -... with torch.no_grad(): -... outputs = model(**inputs) -... logits = outputs.logits - -... return logits -``` - -Now, pass your input to the model and return the `logits`: - -``` ->>> logits = run_inference(trained_model, sample_test_video["video"]) -``` - -Decoding the `logits`, we get: - -```py ->>> predicted_class_idx = logits.argmax(-1).item() ->>> print("Predicted class:", model.config.id2label[predicted_class_idx]) -# Predicted class: BasketballDunk -``` \ No newline at end of file diff --git a/docs/source/en/tasks/visual_question_answering.md b/docs/source/en/tasks/visual_question_answering.md new file mode 100644 index 000000000000..c45f12dbc1e7 --- /dev/null +++ b/docs/source/en/tasks/visual_question_answering.md @@ -0,0 +1,401 @@ + + +# Visual Question Answering + +[[open-in-colab]] + +Visual Question Answering (VQA) is the task of answering open-ended questions based on an image. +The input to models supporting this task is typically a combination of an image and a question, and the output is an +answer expressed in natural language. + +Some noteworthy use case examples for VQA include: +* Accessibility applications for visually impaired individuals. +* Education: posing questions about visual materials presented in lectures or textbooks. VQA can also be utilized in interactive museum exhibits or historical sites. +* Customer service and e-commerce: VQA can enhance user experience by letting users ask questions about products. +* Image retrieval: VQA models can be used to retrieve images with specific characteristics. For example, the user can ask "Is there a dog?" to find all images with dogs from a set of images. + +In this guide you'll learn how to: + +- Fine-tune a classification VQA model, specifically [ViLT](../model_doc/vilt), on the [`Graphcore/vqa` dataset](https://huggingface.co/datasets/Graphcore/vqa). +- Use your fine-tuned ViLT for inference. +- Run zero-shot VQA inference with a generative model, like BLIP-2. 
+ +## Fine-tuning ViLT + +ViLT model incorporates text embeddings into a Vision Transformer (ViT), allowing it to have a minimal design for +Vision-and-Language Pre-training (VLP). This model can be used for several downstream tasks. For the VQA task, a classifier +head is placed on top (a linear layer on top of the final hidden state of the `[CLS]` token) and randomly initialized. +Visual Question Answering is thus treated as a **classification problem**. + +More recent models, such as BLIP, BLIP-2, and InstructBLIP, treat VQA as a generative task. Later in this guide we +illustrate how to use them for zero-shot VQA inference. + +Before you begin, make sure you have all the necessary libraries installed. + +```bash +pip install -q transformers datasets +``` + +We encourage you to share your model with the community. Log in to your Hugging Face account to upload it to the 🤗 Hub. +When prompted, enter your token to log in: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +Let's define the model checkpoint as a global variable. + +```py +>>> model_checkpoint = "dandelin/vilt-b32-mlm" +``` + +## Load the data + +For illustration purposes, in this guide we use a very small sample of the annotated visual question answering `Graphcore/vqa` dataset. +You can find the full dataset on [🤗 Hub](https://huggingface.co/datasets/Graphcore/vqa). + +As an alternative to the [`Graphcore/vqa` dataset](https://huggingface.co/datasets/Graphcore/vqa), you can download the +same data manually from the official [VQA dataset page](https://visualqa.org/download.html). If you prefer to follow the +tutorial with your custom data, check out how to [Create an image dataset](https://huggingface.co/docs/datasets/image_dataset#loading-script) +guide in the 🤗 Datasets documentation. + +Let's load the first 200 examples from the validation split and explore the dataset's features: + +```python +>>> from datasets import load_dataset + +>>> dataset = load_dataset("Graphcore/vqa", split="validation[:200]") +>>> dataset +Dataset({ + features: ['question', 'question_type', 'question_id', 'image_id', 'answer_type', 'label'], + num_rows: 200 +}) +``` + +Let's take a look at an example to understand the dataset's features: + +```py +>>> dataset[0] +{'question': 'Where is he looking?', + 'question_type': 'none of the above', + 'question_id': 262148000, + 'image_id': '/root/.cache/huggingface/datasets/downloads/extracted/ca733e0e000fb2d7a09fbcc94dbfe7b5a30750681d0e965f8e0a23b1c2f98c75/val2014/COCO_val2014_000000262148.jpg', + 'answer_type': 'other', + 'label': {'ids': ['at table', 'down', 'skateboard', 'table'], + 'weights': [0.30000001192092896, + 1.0, + 0.30000001192092896, + 0.30000001192092896]}} +``` + +The features relevant to the task include: +* `question`: the question to be answered from the image +* `image_id`: the path to the image the question refers to +* `label`: the annotations + +We can remove the rest of the features as they won't be necessary: + +```py +>>> dataset = dataset.remove_columns(['question_type', 'question_id', 'answer_type']) +``` + +As you can see, the `label` feature contains several answers to the same question (called `ids` here) collected by different human annotators. +This is because the answer to a question can be subjective. In this case, the question is "where is he looking?". Some people +annotated this with "down", others with "at table", another one with "skateboard", etc. 
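+
+If you're curious which answers dominate this small sample, a couple of lines of standard Python give a quick, optional overview (nothing later in the guide depends on this step):
+
+```py
+>>> from collections import Counter
+
+>>> answer_counts = Counter(answer for example in dataset["label"] for answer in example["ids"])
+>>> answer_counts.most_common(5)  # the most frequent annotator answers in this 200-example sample
+```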
+
+Take a look at the image and consider which answer you would give:
+
+```python
+>>> from PIL import Image
+
+>>> image = Image.open(dataset[0]['image_id'])
+>>> image
+```
+
+<div class="flex justify-center">
+ VQA Image Example +
+ +Due to the questions' and answers' ambiguity, datasets like this are treated as a multi-label classification problem (as +multiple answers are possibly valid). Moreover, rather than just creating a one-hot encoded vector, one creates a +soft encoding, based on the number of times a certain answer appeared in the annotations. + +For instance, in the example above, because the answer "down" is selected way more often than other answers, it has a +score (called `weight` in the dataset) of 1.0, and the rest of the answers have scores < 1.0. + +To later instantiate the model with an appropriate classification head, let's create two dictionaries: one that maps +the label name to an integer and vice versa: + +```py +>>> import itertools + +>>> labels = [item['ids'] for item in dataset['label']] +>>> flattened_labels = list(itertools.chain(*labels)) +>>> unique_labels = list(set(flattened_labels)) + +>>> label2id = {label: idx for idx, label in enumerate(unique_labels)} +>>> id2label = {idx: label for label, idx in label2id.items()} +``` + +Now that we have the mappings, we can replace the string answers with their ids, and flatten the dataset for a more convenient further preprocessing. + +```python +>>> def replace_ids(inputs): +... inputs["label"]["ids"] = [label2id[x] for x in inputs["label"]["ids"]] +... return inputs + + +>>> dataset = dataset.map(replace_ids) +>>> flat_dataset = dataset.flatten() +>>> flat_dataset.features +{'question': Value(dtype='string', id=None), + 'image_id': Value(dtype='string', id=None), + 'label.ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), + 'label.weights': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None)} +``` + +## Preprocessing data + +The next step is to load a ViLT processor to prepare the image and text data for the model. +[`ViltProcessor`] wraps a BERT tokenizer and ViLT image processor into a convenient single processor: + +```py +>>> from transformers import ViltProcessor + +>>> processor = ViltProcessor.from_pretrained(model_checkpoint) +``` + +To preprocess the data we need to encode the images and questions using the [`ViltProcessor`]. The processor will use +the [`BertTokenizerFast`] to tokenize the text and create `input_ids`, `attention_mask` and `token_type_ids` for the text data. +As for images, the processor will leverage [`ViltImageProcessor`] to resize and normalize the image, and create `pixel_values` and `pixel_mask`. + +All these preprocessing steps are done under the hood, we only need to call the `processor`. However, we still need to +prepare the target labels. In this representation, each element corresponds to a possible answer (label). For correct answers, the element holds +their respective score (weight), while the remaining elements are set to zero. + +The following function applies the `processor` to the images and questions and formats the labels as described above: + +```py +>>> import torch + +>>> def preprocess_data(examples): +... image_paths = examples['image_id'] +... images = [Image.open(image_path) for image_path in image_paths] +... texts = examples['question'] + +... encoding = processor(images, texts, padding="max_length", truncation=True, return_tensors="pt") + +... for k, v in encoding.items(): +... encoding[k] = v.squeeze() + +... targets = [] + +... for labels, scores in zip(examples['label.ids'], examples['label.weights']): +... target = torch.zeros(len(id2label)) + +... for label, score in zip(labels, scores): +... target[label] = score + +... 
targets.append(target) + +... encoding["labels"] = targets + +... return encoding +``` + +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.map`] function. You can speed up `map` by +setting `batched=True` to process multiple elements of the dataset at once. At this point, feel free to remove the columns you don't need. + +```py +>>> processed_dataset = flat_dataset.map(preprocess_data, batched=True, remove_columns=['question','question_type', 'question_id', 'image_id', 'answer_type', 'label.ids', 'label.weights']) +>>> processed_dataset +Dataset({ + features: ['input_ids', 'token_type_ids', 'attention_mask', 'pixel_values', 'pixel_mask', 'labels'], + num_rows: 200 +}) +``` + +As a final step, create a batch of examples using [`DefaultDataCollator`]: + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator() +``` + +## Train the model + +You’re ready to start training your model now! Load ViLT with [`ViltForQuestionAnswering`]. Specify the number of labels +along with the label mappings: + +```py +>>> from transformers import ViltForQuestionAnswering + +>>> model = ViltForQuestionAnswering.from_pretrained(model_checkpoint, num_labels=len(id2label), id2label=id2label, label2id=label2id) +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`TrainingArguments`]: + +```py +>>> from transformers import TrainingArguments + +>>> repo_id = "MariaK/vilt_finetuned_200" + +>>> training_args = TrainingArguments( +... output_dir=repo_id, +... per_device_train_batch_size=4, +... num_train_epochs=20, +... save_steps=200, +... logging_steps=50, +... learning_rate=5e-5, +... save_total_limit=2, +... remove_unused_columns=False, +... push_to_hub=True, +... ) +``` + +2. Pass the training arguments to [`Trainer`] along with the model, dataset, processor, and data collator. + +```py +>>> from transformers import Trainer + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... data_collator=data_collator, +... train_dataset=processed_dataset, +... tokenizer=processor, +... ) +``` + +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> trainer.train() +``` + +Once training is completed, share your model to the Hub with the [`~Trainer.push_to_hub`] method to share your final model on the 🤗 Hub: + +```py +>>> trainer.push_to_hub() +``` + +## Inference + +Now that you have fine-tuned a ViLT model, and uploaded it to the 🤗 Hub, you can use it for inference. The simplest +way to try out your fine-tuned model for inference is to use it in a [`Pipeline`]. + +```py +>>> from transformers import pipeline + +>>> pipe = pipeline("visual-question-answering", model="MariaK/vilt_finetuned_200") +``` + +The model in this guide has only been trained on 200 examples, so don't expect a lot from it. Let's see if it at least +learned something from the data and take the first example from the dataset to illustrate inference: + +```py +>>> example = dataset[0] +>>> image = Image.open(example['image_id']) +>>> question = example['question'] +>>> print(question) +>>> pipe(image, question, top_k=1) +"Where is he looking?" +[{'score': 0.5498199462890625, 'answer': 'down'}] +``` + +Even though not very confident, the model indeed has learned something. With more examples and longer training, you'll get far better results! + +You can also manually replicate the results of the pipeline if you'd like: +1. 
Take an image and a question, prepare them for the model using the processor from your model. +2. Forward the result or preprocessing through the model. +3. From the logits, get the most likely answer's id, and find the actual answer in the `id2label`. + +```py +>>> processor = ViltProcessor.from_pretrained("MariaK/vilt_finetuned_200") + +>>> image = Image.open(example['image_id']) +>>> question = example['question'] + +>>> # prepare inputs +>>> inputs = processor(image, question, return_tensors="pt") + +>>> model = ViltForQuestionAnswering.from_pretrained("MariaK/vilt_finetuned_200") + +>>> # forward pass +>>> with torch.no_grad(): +... outputs = model(**inputs) + +>>> logits = outputs.logits +>>> idx = logits.argmax(-1).item() +>>> print("Predicted answer:", model.config.id2label[idx]) +Predicted answer: down +``` + +## Zero-shot VQA + +The previous model treated VQA as a classification task. Some recent models, such as BLIP, BLIP-2, and InstructBLIP approach +VQA as a generative task. Let's take [BLIP-2](../model_doc/blip-2) as an example. It introduced a new visual-language pre-training +paradigm in which any combination of pre-trained vision encoder and LLM can be used (learn more in the [BLIP-2 blog post](https://huggingface.co/blog/blip-2)). +This enables achieving state-of-the-art results on multiple visual-language tasks including visual question answering. + +Let's illustrate how you can use this model for VQA. First, let's load the model. Here we'll explicitly send the model to a +GPU, if available, which we didn't need to do earlier when training, as [`Trainer`] handles this automatically: + +```py +>>> from transformers import AutoProcessor, Blip2ForConditionalGeneration +>>> import torch + +>>> processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b") +>>> model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16) +>>> device = "cuda" if torch.cuda.is_available() else "cpu" +>>> model.to(device) +``` + +The model takes image and text as input, so let's use the exact same image/question pair from the first example in the VQA dataset: + +```py +>>> example = dataset[0] +>>> image = Image.open(example['image_id']) +>>> question = example['question'] +``` + +To use BLIP-2 for visual question answering task, the textual prompt has to follow a specific format: `Question: {} Answer:`. + +```py +>>> prompt = f"Question: {question} Answer:" +``` + +Now we need to preprocess the image/prompt with the model's processor, pass the processed input through the model, and decode the output: + +```py +>>> inputs = processor(image, text=prompt, return_tensors="pt").to(device, torch.float16) + +>>> generated_ids = model.generate(**inputs, max_new_tokens=10) +>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() +>>> print(generated_text) +"He is looking at the crowd" +``` + +As you can see, the model recognized the crowd, and the direction of the face (looking down), however, it seems to miss +the fact the crowd is behind the skater. Still, in cases where acquiring human-annotated datasets is not feasible, this +approach can quickly produce useful results. 
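+
+Since the prompt template is the only task-specific piece, you can reuse the same model and processor to ask further questions about the image. Here is a small sketch along those lines; the extra questions are only illustrative:
+
+```py
+>>> questions = ["What is he doing?", "How many people are in the picture?"]
+>>> for q in questions:
+...     prompt = f"Question: {q} Answer:"
+...     inputs = processor(image, text=prompt, return_tensors="pt").to(device, torch.float16)
+...     generated_ids = model.generate(**inputs, max_new_tokens=10)
+...     print(q, "->", processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip())
+```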
+ diff --git a/docs/source/en/tasks/zero_shot_image_classification.md b/docs/source/en/tasks/zero_shot_image_classification.md new file mode 100644 index 000000000000..45775d40cad3 --- /dev/null +++ b/docs/source/en/tasks/zero_shot_image_classification.md @@ -0,0 +1,147 @@ + + +# Zero-shot image classification + +[[open-in-colab]] + +Zero-shot image classification is a task that involves classifying images into different categories using a model that was +not explicitly trained on data containing labeled examples from those specific categories. + +Traditionally, image classification requires training a model on a specific set of labeled images, and this model learns to +"map" certain image features to labels. When there's a need to use such model for a classification task that introduces a +new set of labels, fine-tuning is required to "recalibrate" the model. + +In contrast, zero-shot or open vocabulary image classification models are typically multi-modal models that have been trained on a large +dataset of images and associated descriptions. These models learn aligned vision-language representations that can be used for many downstream tasks including zero-shot image classification. + +This is a more flexible approach to image classification that allows models to generalize to new and unseen categories +without the need for additional training data and enables users to query images with free-form text descriptions of their target objects . + +In this guide you'll learn how to: + +* create a zero-shot image classification pipeline +* run zero-shot image classification inference by hand + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install -q transformers +``` + +## Zero-shot image classification pipeline + +The simplest way to try out inference with a model supporting zero-shot image classification is to use the corresponding [`pipeline`]. +Instantiate a pipeline from a [checkpoint on the Hugging Face Hub](https://huggingface.co/models?pipeline_tag=zero-shot-image-classification&sort=downloads): + +```python +>>> from transformers import pipeline + +>>> checkpoint = "openai/clip-vit-large-patch14" +>>> detector = pipeline(model=checkpoint, task="zero-shot-image-classification") +``` + +Next, choose an image you'd like to classify. + +```py +>>> from PIL import Image +>>> import requests + +>>> url = "https://unsplash.com/photos/g8oS8-82DxI/download?ixid=MnwxMjA3fDB8MXx0b3BpY3x8SnBnNktpZGwtSGt8fHx8fDJ8fDE2NzgxMDYwODc&force=true&w=640" +>>> image = Image.open(requests.get(url, stream=True).raw) + +>>> image +``` + +
+ Photo of an owl +
+ +Pass the image and the candidate object labels to the pipeline. Here we pass the image directly; other suitable options +include a local path to an image or an image url. +The candidate labels can be simple words like in this example, or more descriptive. + +```py +>>> predictions = detector(image, candidate_labels=["fox", "bear", "seagull", "owl"]) +>>> predictions +[{'score': 0.9996670484542847, 'label': 'owl'}, + {'score': 0.000199399160919711, 'label': 'seagull'}, + {'score': 7.392891711788252e-05, 'label': 'fox'}, + {'score': 5.96074532950297e-05, 'label': 'bear'}] +``` + +## Zero-shot image classification by hand + +Now that you've seen how to use the zero-shot image classification pipeline, let's take a look how you can run zero-shot +image classification manually. + +Start by loading the model and associated processor from a [checkpoint on the Hugging Face Hub](https://huggingface.co/models?pipeline_tag=zero-shot-image-classification&sort=downloads). +Here we'll use the same checkpoint as before: + +```py +>>> from transformers import AutoProcessor, AutoModelForZeroShotImageClassification + +>>> model = AutoModelForZeroShotImageClassification.from_pretrained(checkpoint) +>>> processor = AutoProcessor.from_pretrained(checkpoint) +``` + +Let's take a different image to switch things up. + +```py +>>> from PIL import Image +>>> import requests + +>>> url = "https://unsplash.com/photos/xBRQfR2bqNI/download?ixid=MnwxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNjc4Mzg4ODEx&force=true&w=640" +>>> image = Image.open(requests.get(url, stream=True).raw) + +>>> image +``` + +
+ Photo of a car +
+ +Use the processor to prepare the inputs for the model. The processor combines an image processor that prepares the +image for the model by resizing and normalizing it, and a tokenizer that takes care of the text inputs. + +```py +>>> candidate_labels = ["tree", "car", "bike", "cat"] +>>> inputs = processor(images=image, text=candidate_labels, return_tensors="pt", padding=True) +``` + +Pass the inputs through the model, and post-process the results: + +```py +>>> import torch + +>>> with torch.no_grad(): +... outputs = model(**inputs) + +>>> logits = outputs.logits_per_image[0] +>>> probs = logits.softmax(dim=-1).numpy() +>>> scores = probs.tolist() + +>>> result = [ +... {"score": score, "label": candidate_label} +... for score, candidate_label in sorted(zip(probs, candidate_labels), key=lambda x: -x[0]) +... ] + +>>> result +[{'score': 0.998572, 'label': 'car'}, + {'score': 0.0010570387, 'label': 'bike'}, + {'score': 0.0003393686, 'label': 'tree'}, + {'score': 3.1572064e-05, 'label': 'cat'}] +``` \ No newline at end of file diff --git a/docs/source/en/tasks/zero_shot_object_detection.md b/docs/source/en/tasks/zero_shot_object_detection.md new file mode 100644 index 000000000000..3dfefb3c8b5e --- /dev/null +++ b/docs/source/en/tasks/zero_shot_object_detection.md @@ -0,0 +1,309 @@ + + +# Zero-shot object detection + +[[open-in-colab]] + +Traditionally, models used for [object detection](object_detection) require labeled image datasets for training, +and are limited to detecting the set of classes from the training data. + +Zero-shot object detection is supported by the [OWL-ViT](../model_doc/owlvit) model which uses a different approach. OWL-ViT +is an open-vocabulary object detector. It means that it can detect objects in images based on free-text queries without +the need to fine-tune the model on labeled datasets. + +OWL-ViT leverages multi-modal representations to perform open-vocabulary detection. It combines [CLIP](../model_doc/clip) with +lightweight object classification and localization heads. Open-vocabulary detection is achieved by embedding free-text queries with the text encoder of CLIP and using them as input to the object classification and localization heads. +associate images and their corresponding textual descriptions, and ViT processes image patches as inputs. The authors +of OWL-ViT first trained CLIP from scratch and then fine-tuned OWL-ViT end to end on standard object detection datasets using +a bipartite matching loss. + +With this approach, the model can detect objects based on textual descriptions without prior training on labeled datasets. + +In this guide, you will learn how to use OWL-ViT: +- to detect objects based on text prompts +- for batch object detection +- for image-guided object detection + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install -q transformers +``` + +## Zero-shot object detection pipeline + +The simplest way to try out inference with OWL-ViT is to use it in a [`pipeline`]. Instantiate a pipeline +for zero-shot object detection from a [checkpoint on the Hugging Face Hub](https://huggingface.co/models?other=owlvit): + +```python +>>> from transformers import pipeline + +>>> checkpoint = "google/owlvit-base-patch32" +>>> detector = pipeline(model=checkpoint, task="zero-shot-object-detection") +``` + +Next, choose an image you'd like to detect objects in. 
Here we'll use the image of astronaut Eileen Collins that is +a part of the [NASA](https://www.nasa.gov/multimedia/imagegallery/index.html) Great Images dataset. + +```py +>>> import skimage +>>> import numpy as np +>>> from PIL import Image + +>>> image = skimage.data.astronaut() +>>> image = Image.fromarray(np.uint8(image)).convert("RGB") + +>>> image +``` + +
+ Astronaut Eileen Collins +
+ +Pass the image and the candidate object labels to look for to the pipeline. +Here we pass the image directly; other suitable options include a local path to an image or an image url. We also pass text descriptions for all items we want to query the image for. + +```py +>>> predictions = detector( +... image, +... candidate_labels=["human face", "rocket", "nasa badge", "star-spangled banner"], +... ) +>>> predictions +[{'score': 0.3571370542049408, + 'label': 'human face', + 'box': {'xmin': 180, 'ymin': 71, 'xmax': 271, 'ymax': 178}}, + {'score': 0.28099656105041504, + 'label': 'nasa badge', + 'box': {'xmin': 129, 'ymin': 348, 'xmax': 206, 'ymax': 427}}, + {'score': 0.2110239565372467, + 'label': 'rocket', + 'box': {'xmin': 350, 'ymin': -1, 'xmax': 468, 'ymax': 288}}, + {'score': 0.13790413737297058, + 'label': 'star-spangled banner', + 'box': {'xmin': 1, 'ymin': 1, 'xmax': 105, 'ymax': 509}}, + {'score': 0.11950037628412247, + 'label': 'nasa badge', + 'box': {'xmin': 277, 'ymin': 338, 'xmax': 327, 'ymax': 380}}, + {'score': 0.10649408400058746, + 'label': 'rocket', + 'box': {'xmin': 358, 'ymin': 64, 'xmax': 424, 'ymax': 280}}] +``` + +Let's visualize the predictions: + +```py +>>> from PIL import ImageDraw + +>>> draw = ImageDraw.Draw(image) + +>>> for prediction in predictions: +... box = prediction["box"] +... label = prediction["label"] +... score = prediction["score"] + +... xmin, ymin, xmax, ymax = box.values() +... draw.rectangle((xmin, ymin, xmax, ymax), outline="red", width=1) +... draw.text((xmin, ymin), f"{label}: {round(score,2)}", fill="white") + +>>> image +``` + +
+ Visualized predictions on NASA image +
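+
+If you only want to keep the strongest detections before drawing them, you can filter the pipeline output on the score. The 0.2 cutoff below is an arbitrary value chosen for illustration:
+
+```py
+>>> confident_predictions = [p for p in predictions if p["score"] > 0.2]
+>>> [(p["label"], round(p["score"], 2)) for p in confident_predictions]
+[('human face', 0.36), ('nasa badge', 0.28), ('rocket', 0.21)]
+```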
+ +## Text-prompted zero-shot object detection by hand + +Now that you've seen how to use the zero-shot object detection pipeline, let's replicate the same +result manually. + +Start by loading the model and associated processor from a [checkpoint on the Hugging Face Hub](https://huggingface.co/models?other=owlvit). +Here we'll use the same checkpoint as before: + +```py +>>> from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection + +>>> model = AutoModelForZeroShotObjectDetection.from_pretrained(checkpoint) +>>> processor = AutoProcessor.from_pretrained(checkpoint) +``` + +Let's take a different image to switch things up. + +```py +>>> import requests + +>>> url = "https://unsplash.com/photos/oj0zeY2Ltk4/download?ixid=MnwxMjA3fDB8MXxzZWFyY2h8MTR8fHBpY25pY3xlbnwwfHx8fDE2Nzc0OTE1NDk&force=true&w=640" +>>> im = Image.open(requests.get(url, stream=True).raw) +>>> im +``` + +
+ Beach photo +
+ +Use the processor to prepare the inputs for the model. The processor combines an image processor that prepares the +image for the model by resizing and normalizing it, and a [`CLIPTokenizer`] that takes care of the text inputs. + +```py +>>> text_queries = ["hat", "book", "sunglasses", "camera"] +>>> inputs = processor(text=text_queries, images=im, return_tensors="pt") +``` + +Pass the inputs through the model, post-process, and visualize the results. Since the image processor resized images before +feeding them to the model, you need to use the [`~OwlViTImageProcessor.post_process_object_detection`] method to make sure the predicted bounding +boxes have the correct coordinates relative to the original image: + +```py +>>> import torch + +>>> with torch.no_grad(): +... outputs = model(**inputs) +... target_sizes = torch.tensor([im.size[::-1]]) +... results = processor.post_process_object_detection(outputs, threshold=0.1, target_sizes=target_sizes)[0] + +>>> draw = ImageDraw.Draw(im) + +>>> scores = results["scores"].tolist() +>>> labels = results["labels"].tolist() +>>> boxes = results["boxes"].tolist() + +>>> for box, score, label in zip(boxes, scores, labels): +... xmin, ymin, xmax, ymax = box +... draw.rectangle((xmin, ymin, xmax, ymax), outline="red", width=1) +... draw.text((xmin, ymin), f"{text_queries[label]}: {round(score,2)}", fill="white") + +>>> im +``` + +
+ Beach photo with detected objects +
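+
+If you'd rather work with the detections programmatically than draw them, you can reshape the post-processed results into the same kind of list the pipeline returns. This is just a convenience sketch reusing the variables from the code above:
+
+```py
+>>> detections = [
+...     {"score": round(score, 3), "label": text_queries[label], "box": [round(coord, 1) for coord in box]}
+...     for box, score, label in zip(boxes, scores, labels)
+... ]
+>>> detections[:2]
+```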
+ +## Batch processing + +You can pass multiple sets of images and text queries to search for different (or same) objects in several images. +Let's use both an astronaut image and the beach image together. +For batch processing, you should pass text queries as a nested list to the processor and images as lists of PIL images, +PyTorch tensors, or NumPy arrays. + +```py +>>> images = [image, im] +>>> text_queries = [ +... ["human face", "rocket", "nasa badge", "star-spangled banner"], +... ["hat", "book", "sunglasses", "camera"], +... ] +>>> inputs = processor(text=text_queries, images=images, return_tensors="pt") +``` + +Previously for post-processing you passed the single image's size as a tensor, but you can also pass a tuple, or, in case +of several images, a list of tuples. Let's create predictions for the two examples, and visualize the second one (`image_idx = 1`). + +```py +>>> with torch.no_grad(): +... outputs = model(**inputs) +... target_sizes = [x.size[::-1] for x in images] +... results = processor.post_process_object_detection(outputs, threshold=0.1, target_sizes=target_sizes) + +>>> image_idx = 1 +>>> draw = ImageDraw.Draw(images[image_idx]) + +>>> scores = results[image_idx]["scores"].tolist() +>>> labels = results[image_idx]["labels"].tolist() +>>> boxes = results[image_idx]["boxes"].tolist() + +>>> for box, score, label in zip(boxes, scores, labels): +... xmin, ymin, xmax, ymax = box +... draw.rectangle((xmin, ymin, xmax, ymax), outline="red", width=1) +... draw.text((xmin, ymin), f"{text_queries[image_idx][label]}: {round(score,2)}", fill="white") + +>>> images[image_idx] +``` + +
+ Beach photo with detected objects +
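+
+Since `results` now holds one entry per image, you can loop over it to see how many detections each image produced at this threshold, for example:
+
+```py
+>>> for image_index, image_results in enumerate(results):
+...     print(f"Image {image_index}: {len(image_results['scores'])} detections")
+```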
+ +## Image-guided object detection + +In addition to zero-shot object detection with text queries, OWL-ViT offers image-guided object detection. This means +you can use an image query to find similar objects in the target image. +Unlike text queries, only a single example image is allowed. + +Let's take an image with two cats on a couch as a target image, and an image of a single cat +as a query: + +```py +>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" +>>> image_target = Image.open(requests.get(url, stream=True).raw) + +>>> query_url = "http://images.cocodataset.org/val2017/000000524280.jpg" +>>> query_image = Image.open(requests.get(query_url, stream=True).raw) +``` + +Let's take a quick look at the images: + +```py +>>> import matplotlib.pyplot as plt + +>>> fig, ax = plt.subplots(1, 2) +>>> ax[0].imshow(image_target) +>>> ax[1].imshow(query_image) +``` + +
+ Cats +
+
+In the preprocessing step, instead of text queries, you now need to use `query_images`:
+
+```py
+>>> inputs = processor(images=image_target, query_images=query_image, return_tensors="pt")
+```
+
+For predictions, instead of passing the inputs to the model, pass them to [`~OwlViTForObjectDetection.image_guided_detection`]. Draw the predictions
+as before, except now there are no labels.
+
+```py
+>>> with torch.no_grad():
+...     outputs = model.image_guided_detection(**inputs)
+...     target_sizes = torch.tensor([image_target.size[::-1]])
+...     results = processor.post_process_image_guided_detection(outputs=outputs, target_sizes=target_sizes)[0]
+
+>>> draw = ImageDraw.Draw(image_target)
+
+>>> scores = results["scores"].tolist()
+>>> boxes = results["boxes"].tolist()
+
+>>> for box, score in zip(boxes, scores):
+...     xmin, ymin, xmax, ymax = box
+...     draw.rectangle((xmin, ymin, xmax, ymax), outline="white", width=4)
+
+>>> image_target
+```
+
+<div class="flex justify-center">
+ Cats with bounding boxes +
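+
+Image-guided detection returns scores and boxes but no labels, so printing the scores is a quick way to judge how strong each match is before settling on a threshold:
+
+```py
+>>> for score, box in zip(results["scores"].tolist(), results["boxes"].tolist()):
+...     print(f"score: {score:.3f}, box: {[round(coord, 1) for coord in box]}")
+```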
+ +If you'd like to interactively try out inference with OWL-ViT, check out this demo: + + diff --git a/docs/source/en/tasks_explained.md b/docs/source/en/tasks_explained.md new file mode 100644 index 000000000000..d453e38e86b9 --- /dev/null +++ b/docs/source/en/tasks_explained.md @@ -0,0 +1,295 @@ + + +# How 🤗 Transformers solve tasks + +In [What 🤗 Transformers can do](task_summary), you learned about natural language processing (NLP), speech and audio, computer vision tasks, and some important applications of them. This page will look closely at how models solve these tasks and explain what's happening under the hood. There are many ways to solve a given task, some models may implement certain techniques or even approach the task from a new angle, but for Transformer models, the general idea is the same. Owing to its flexible architecture, most models are a variant of an encoder, decoder, or encoder-decoder structure. In addition to Transformer models, our library also has several convolutional neural networks (CNNs), which are still used today for computer vision tasks. We'll also explain how a modern CNN works. + +To explain how tasks are solved, we'll walk through what goes on inside the model to output useful predictions. + +- [Wav2Vec2](model_doc/wav2vec2) for audio classification and automatic speech recognition (ASR) +- [Vision Transformer (ViT)](model_doc/vit) and [ConvNeXT](model_doc/convnext) for image classification +- [DETR](model_doc/detr) for object detection +- [Mask2Former](model_doc/mask2former) for image segmentation +- [GLPN](model_doc/glpn) for depth estimation +- [BERT](model_doc/bert) for NLP tasks like text classification, token classification and question answering that use an encoder +- [GPT2](model_doc/gpt2) for NLP tasks like text generation that use a decoder +- [BART](model_doc/bart) for NLP tasks like summarization and translation that use an encoder-decoder + + + +Before you go further, it is good to have some basic knowledge of the original Transformer architecture. Knowing how encoders, decoders, and attention work will aid you in understanding how different Transformer models work. If you're just getting started or need a refresher, check out our [course](https://huggingface.co/course/chapter1/4?fw=pt) for more information! + + + +## Speech and audio + +[Wav2Vec2](model_doc/wav2vec2) is a self-supervised model pretrained on unlabeled speech data and finetuned on labeled data for audio classification and automatic speech recognition. + +
+ +
+ +This model has four main components: + +1. A *feature encoder* takes the raw audio waveform, normalizes it to zero mean and unit variance, and converts it into a sequence of feature vectors that are each 20ms long. + +2. Waveforms are continuous by nature, so they can't be divided into separate units like a sequence of text can be split into words. That's why the feature vectors are passed to a *quantization module*, which aims to learn discrete speech units. The speech unit is chosen from a collection of codewords, known as a *codebook* (you can think of this as the vocabulary). From the codebook, the vector or speech unit, that best represents the continuous audio input is chosen and forwarded through the model. + +3. About half of the feature vectors are randomly masked, and the masked feature vector is fed to a *context network*, which is a Transformer encoder that also adds relative positional embeddings. + +4. The pretraining objective of the context network is a *contrastive task*. The model has to predict the true quantized speech representation of the masked prediction from a set of false ones, encouraging the model to find the most similar context vector and quantized speech unit (the target label). + +Now that wav2vec2 is pretrained, you can finetune it on your data for audio classification or automatic speech recognition! + +### Audio classification + +To use the pretrained model for audio classification, add a sequence classification head on top of the base Wav2Vec2 model. The classification head is a linear layer that accepts the encoder's hidden states. The hidden states represent the learned features from each audio frame which can have varying lengths. To create one vector of fixed-length, the hidden states are pooled first and then transformed into logits over the class labels. The cross-entropy loss is calculated between the logits and target to find the most likely class. + +Ready to try your hand at audio classification? Check out our complete [audio classification guide](tasks/audio_classification) to learn how to finetune Wav2Vec2 and use it for inference! + +### Automatic speech recognition + +To use the pretrained model for automatic speech recognition, add a language modeling head on top of the base Wav2Vec2 model for [connectionist temporal classification (CTC)](glossary#connectionist-temporal-classification-ctc). The language modeling head is a linear layer that accepts the encoder's hidden states and transforms them into logits. Each logit represents a token class (the number of tokens comes from the task vocabulary). The CTC loss is calculated between the logits and targets to find the most likely sequence of tokens, which are then decoded into a transcription. + +Ready to try your hand at automatic speech recognition? Check out our complete [automatic speech recognition guide](tasks/asr) to learn how to finetune Wav2Vec2 and use it for inference! + +## Computer vision + +There are two ways to approach computer vision tasks: + +1. Split an image into a sequence of patches and process them in parallel with a Transformer. +2. Use a modern CNN, like [ConvNeXT](model_doc/convnext), which relies on convolutional layers but adopts modern network designs. + + + +A third approach mixes Transformers with convolutions (for example, [Convolutional Vision Transformer](model_doc/cvt) or [LeViT](model_doc/levit)). We won't discuss those because they just combine the two approaches we examine here. 
+ + + +ViT and ConvNeXT are commonly used for image classification, but for other vision tasks like object detection, segmentation, and depth estimation, we'll look at DETR, Mask2Former and GLPN, respectively; these models are better suited for those tasks. + +### Image classification + +ViT and ConvNeXT can both be used for image classification; the main difference is that ViT uses an attention mechanism while ConvNeXT uses convolutions. + +#### Transformer + +[ViT](model_doc/vit) replaces convolutions entirely with a pure Transformer architecture. If you're familiar with the original Transformer, then you're already most of the way toward understanding ViT. + +
+ +
+ +The main change ViT introduced was in how images are fed to a Transformer: + +1. An image is split into square non-overlapping patches, each of which gets turned into a vector or *patch embedding*. The patch embeddings are generated from a convolutional 2D layer which creates the proper input dimensions (which for a base Transformer is 768 values for each patch embedding). If you had a 224x224 pixel image, you could split it into 196 16x16 image patches. Just like how text is tokenized into words, an image is "tokenized" into a sequence of patches. + +2. A *learnable embedding* - a special `[CLS]` token - is added to the beginning of the patch embeddings just like BERT. The final hidden state of the `[CLS]` token is used as the input to the attached classification head; other outputs are ignored. This token helps the model learn how to encode a representation of the image. + +3. The last thing to add to the patch and learnable embeddings are the *position embeddings* because the model doesn't know how the image patches are ordered. The position embeddings are also learnable and have the same size as the patch embeddings. Finally, all of the embeddings are passed to the Transformer encoder. + +4. The output, specifically only the output with the `[CLS]` token, is passed to a multilayer perceptron head (MLP). ViT's pretraining objective is simply classification. Like other classification heads, the MLP head converts the output into logits over the class labels and calculates the cross-entropy loss to find the most likely class. + +Ready to try your hand at image classification? Check out our complete [image classification guide](tasks/image_classification) to learn how to finetune ViT and use it for inference! + +#### CNN + + + +This section briefly explains convolutions, but it'd be helpful to have a prior understanding of how they change an image's shape and size. If you're unfamiliar with convolutions, check out the [Convolution Neural Networks chapter](https://github.com/fastai/fastbook/blob/master/13_convolutions.ipynb) from the fastai book! + + + +[ConvNeXT](model_doc/convnext) is a CNN architecture that adopts new and modern network designs to improve performance. However, convolutions are still at the core of the model. From a high-level perspective, a [convolution](glossary#convolution) is an operation where a smaller matrix (*kernel*) is multiplied by a small window of the image pixels. It computes some features from it, such as a particular texture or curvature of a line. Then it slides over to the next window of pixels; the distance the convolution travels is known as the *stride*. + +
+ +
+ +A basic convolution without padding or stride, taken from A guide to convolution arithmetic for deep learning. + +You can feed this output to another convolutional layer, and with each successive layer, the network learns more complex and abstract things like hotdogs or rockets. Between convolutional layers, it is common to add a pooling layer to reduce dimensionality and make the model more robust to variations of a feature's position. + +
+ +
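+
+If it helps to see the shapes involved, here is a minimal PyTorch sketch of a convolution followed by pooling; the layer sizes are arbitrary and not tied to any particular model:
+
+```py
+>>> import torch
+>>> from torch import nn
+
+>>> pixel_values = torch.randn(1, 3, 224, 224)  # a dummy batch with one 224x224 RGB image
+
+>>> conv = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1)  # a 3x3 kernel slides over the image
+>>> pool = nn.MaxPool2d(kernel_size=2)  # pooling halves the spatial resolution
+
+>>> features = conv(pixel_values)
+>>> features.shape
+torch.Size([1, 16, 224, 224])
+
+>>> pool(features).shape
+torch.Size([1, 16, 112, 112])
+```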
+ +ConvNeXT modernizes a CNN in five ways: + +1. Change the number of blocks in each stage and "patchify" an image with a larger stride and corresponding kernel size. The non-overlapping sliding window makes this patchifying strategy similar to how ViT splits an image into patches. + +2. A *bottleneck* layer shrinks the number of channels and then restores it because it is faster to do a 1x1 convolution, and you can increase the depth. An inverted bottleneck does the opposite by expanding the number of channels and shrinking them, which is more memory efficient. + +3. Replace the typical 3x3 convolutional layer in the bottleneck layer with *depthwise convolution*, which applies a convolution to each input channel separately and then stacks them back together at the end. This widens the network width for improved performance. + +4. ViT has a global receptive field which means it can see more of an image at once thanks to its attention mechanism. ConvNeXT attempts to replicate this effect by increasing the kernel size to 7x7. + +5. ConvNeXT also makes several layer design changes that imitate Transformer models. There are fewer activation and normalization layers, the activation function is switched to GELU instead of ReLU, and it uses LayerNorm instead of BatchNorm. + +The output from the convolution blocks is passed to a classification head which converts the outputs into logits and calculates the cross-entropy loss to find the most likely label. + +### Object detection + +[DETR](model_doc/detr), *DEtection TRansformer*, is an end-to-end object detection model that combines a CNN with a Transformer encoder-decoder. + +
+ +
+ +1. A pretrained CNN *backbone* takes an image, represented by its pixel values, and creates a low-resolution feature map of it. A 1x1 convolution is applied to the feature map to reduce dimensionality and it creates a new feature map with a high-level image representation. Since the Transformer is a sequential model, the feature map is flattened into a sequence of feature vectors that are combined with positional embeddings. + +2. The feature vectors are passed to the encoder, which learns the image representations using its attention layers. Next, the encoder hidden states are combined with *object queries* in the decoder. Object queries are learned embeddings that focus on the different regions of an image, and they're updated as they progress through each attention layer. The decoder hidden states are passed to a feedforward network that predicts the bounding box coordinates and class label for each object query, or `no object` if there isn't one. + + DETR decodes each object query in parallel to output *N* final predictions, where *N* is the number of queries. Unlike a typical autoregressive model that predicts one element at a time, object detection is a set prediction task (`bounding box`, `class label`) that makes *N* predictions in a single pass. + +3. DETR uses a *bipartite matching loss* during training to compare a fixed number of predictions with a fixed set of ground truth labels. If there are fewer ground truth labels in the set of *N* labels, then they're padded with a `no object` class. This loss function encourages DETR to find a one-to-one assignment between the predictions and ground truth labels. If either the bounding boxes or class labels aren't correct, a loss is incurred. Likewise, if DETR predicts an object that doesn't exist, it is penalized. This encourages DETR to find other objects in an image instead of focusing on one really prominent object. + +An object detection head is added on top of DETR to find the class label and the coordinates of the bounding box. There are two components to the object detection head: a linear layer to transform the decoder hidden states into logits over the class labels, and a MLP to predict the bounding box. + +Ready to try your hand at object detection? Check out our complete [object detection guide](tasks/object_detection) to learn how to finetune DETR and use it for inference! + +### Image segmentation + +[Mask2Former](model_doc/mask2former) is a universal architecture for solving all types of image segmentation tasks. Traditional segmentation models are typically tailored towards a particular subtask of image segmentation, like instance, semantic or panoptic segmentation. Mask2Former frames each of those tasks as a *mask classification* problem. Mask classification groups pixels into *N* segments, and predicts *N* masks and their corresponding class label for a given image. We'll explain how Mask2Former works in this section, and then you can try finetuning SegFormer at the end. + +
+ +
+
+There are three main components to Mask2Former:
+
+1. A [Swin](model_doc/swin) backbone accepts an image and creates a low-resolution image feature map from 3 consecutive 3x3 convolutions.
+
+2. The feature map is passed to a *pixel decoder* which gradually upsamples the low-resolution features into high-resolution per-pixel embeddings. The pixel decoder actually generates multi-scale features (contains both low- and high-resolution features) with resolutions 1/32, 1/16, and 1/8th of the original image.
+
+3. Each of these feature maps of differing scales is fed successively to one Transformer decoder layer at a time in order to capture small objects from the high-resolution features. The key to Mask2Former is the *masked attention* mechanism in the decoder. Unlike cross-attention which can attend to the entire image, masked attention only focuses on a certain area of the image. This is faster and leads to better performance because the local features of an image are enough for the model to learn from.
+
+4. Like [DETR](tasks_explained#object-detection), Mask2Former also uses learned object queries and combines them with the image features from the pixel decoder to make a set prediction (`class label`, `mask prediction`). The decoder hidden states are passed into a linear layer and transformed into logits over the class labels. The cross-entropy loss is calculated between the logits and class label to find the most likely one.
+
+   The mask predictions are generated by combining the pixel-embeddings with the final decoder hidden states. The sigmoid cross-entropy and dice losses are calculated between the logits and the ground truth mask to find the most likely mask.
+
+Ready to try your hand at image segmentation? Check out our complete [image segmentation guide](tasks/semantic_segmentation) to learn how to finetune SegFormer and use it for inference!
+
+### Depth estimation
+
+[GLPN](model_doc/glpn), *Global-Local Path Network*, is a Transformer for depth estimation that combines a [SegFormer](model_doc/segformer) encoder with a lightweight decoder.
+
+<div class="flex justify-center">
+ +
+ +1. Like ViT, an image is split into a sequence of patches, except these image patches are smaller. This is better for dense prediction tasks like segmentation or depth estimation. The image patches are transformed into patch embeddings (see the [image classification](#image-classification) section for more details about how patch embeddings are created), which are fed to the encoder. + +2. The encoder accepts the patch embeddings, and passes them through several encoder blocks. Each block consists of attention and Mix-FFN layers. The purpose of the latter is to provide positional information. At the end of each encoder block is a *patch merging* layer for creating hierarchical representations. The features of each group of neighboring patches are concatenated, and a linear layer is applied to the concatenated features to reduce the number of patches to a resolution of 1/4. This becomes the input to the next encoder block, where this whole process is repeated until you have image features with resolutions of 1/8, 1/16, and 1/32. + +3. A lightweight decoder takes the last feature map (1/32 scale) from the encoder and upsamples it to 1/16 scale. From here, the feature is passed into a *Selective Feature Fusion (SFF)* module, which selects and combines local and global features from an attention map for each feature and then upsamples it to 1/8th. This process is repeated until the decoded features are the same size as the original image. The output is passed through two convolution layers and then a sigmoid activation is applied to predict the depth of each pixel. + +## Natural language processing + +The Transformer was initially designed for machine translation, and since then, it has practically become the default architecture for solving all NLP tasks. Some tasks lend themselves to the Transformer's encoder structure, while others are better suited for the decoder. Still, other tasks make use of both the Transformer's encoder-decoder structure. + +### Text classification + +[BERT](model_doc/bert) is an encoder-only model and is the first model to effectively implement deep bidirectionality to learn richer representations of the text by attending to words on both sides. + +1. BERT uses [WordPiece](tokenizer_summary#wordpiece) tokenization to generate a token embedding of the text. To tell the difference between a single sentence and a pair of sentences, a special `[SEP]` token is added to differentiate them. A special `[CLS]` token is added to the beginning of every sequence of text. The final output with the `[CLS]` token is used as the input to the classification head for classification tasks. BERT also adds a segment embedding to denote whether a token belongs to the first or second sentence in a pair of sentences. + +2. BERT is pretrained with two objectives: masked language modeling and next-sentence prediction. In masked language modeling, some percentage of the input tokens are randomly masked, and the model needs to predict these. This solves the issue of bidirectionality, where the model could cheat and see all the words and "predict" the next word. The final hidden states of the predicted mask tokens are passed to a feedforward network with a softmax over the vocabulary to predict the masked word. + + The second pretraining object is next-sentence prediction. The model must predict whether sentence B follows sentence A. Half of the time sentence B is the next sentence, and the other half of the time, sentence B is a random sentence. 
The prediction, whether it is the next sentence or not, is passed to a feedforward network with a softmax over the two classes (`IsNext` and `NotNext`). + +3. The input embeddings are passed through multiple encoder layers to output some final hidden states. + +To use the pretrained model for text classification, add a sequence classification head on top of the base BERT model. The sequence classification head is a linear layer that accepts the final hidden states and performs a linear transformation to convert them into logits. The cross-entropy loss is calculated between the logits and target to find the most likely label. + +Ready to try your hand at text classification? Check out our complete [text classification guide](tasks/sequence_classification) to learn how to finetune DistilBERT and use it for inference! + +### Token classification + +To use BERT for token classification tasks like named entity recognition (NER), add a token classification head on top of the base BERT model. The token classification head is a linear layer that accepts the final hidden states and performs a linear transformation to convert them into logits. The cross-entropy loss is calculated between the logits and each token to find the most likely label. + +Ready to try your hand at token classification? Check out our complete [token classification guide](tasks/token_classification) to learn how to finetune DistilBERT and use it for inference! + +### Question answering + +To use BERT for question answering, add a span classification head on top of the base BERT model. This linear layer accepts the final hidden states and performs a linear transformation to compute the `span` start and end logits corresponding to the answer. The cross-entropy loss is calculated between the logits and the label position to find the most likely span of text corresponding to the answer. + +Ready to try your hand at question answering? Check out our complete [question answering guide](tasks/question_answering) to learn how to finetune DistilBERT and use it for inference! + + + +💡 Notice how easy it is to use BERT for different tasks once it's been pretrained. You only need to add a specific head to the pretrained model to manipulate the hidden states into your desired output! + + + +### Text generation + +[GPT-2](model_doc/gpt2) is a decoder-only model pretrained on a large amount of text. It can generate convincing (though not always true!) text given a prompt and complete other NLP tasks like question answering despite not being explicitly trained to. + +
+ +
+ +1. GPT-2 uses [byte pair encoding (BPE)](tokenizer_summary#bytepair-encoding-bpe) to tokenize words and generate a token embedding. Positional encodings are added to the token embeddings to indicate the position of each token in the sequence. The input embeddings are passed through multiple decoder blocks to output some final hidden state. Within each decoder block, GPT-2 uses a *masked self-attention* layer which means GPT-2 can't attend to future tokens. It is only allowed to attend to tokens on the left. This is different from BERT's [`mask`] token because, in masked self-attention, an attention mask is used to set the score to `0` for future tokens. + +2. The output from the decoder is passed to a language modeling head, which performs a linear transformation to convert the hidden states into logits. The label is the next token in the sequence, which are created by shifting the logits to the right by one. The cross-entropy loss is calculated between the shifted logits and the labels to output the next most likely token. + +GPT-2's pretraining objective is based entirely on [causal language modeling](glossary#causal-language-modeling), predicting the next word in a sequence. This makes GPT-2 especially good at tasks that involve generating text. + +Ready to try your hand at text generation? Check out our complete [causal language modeling guide](tasks/language_modeling#causal-language-modeling) to learn how to finetune DistilGPT-2 and use it for inference! + + + +For more information about text generation, check out the [text generation strategies](generation_strategies) guide! + + + +### Summarization + +Encoder-decoder models like [BART](model_doc/bart) and [T5](model_doc/t5) are designed for the sequence-to-sequence pattern of a summarization task. We'll explain how BART works in this section, and then you can try finetuning T5 at the end. + +
+ +
+ +1. BART's encoder architecture is very similar to BERT and accepts a token and positional embedding of the text. BART is pretrained by corrupting the input and then reconstructing it with the decoder. Unlike other encoders with specific corruption strategies, BART can apply any type of corruption. The *text infilling* corruption strategy works the best though. In text infilling, a number of text spans are replaced with a **single** [`mask`] token. This is important because the model has to predict the masked tokens, and it teaches the model to predict the number of missing tokens. The input embeddings and masked spans are passed through the encoder to output some final hidden states, but unlike BERT, BART doesn't add a final feedforward network at the end to predict a word. + +2. The encoder's output is passed to the decoder, which must predict the masked tokens and any uncorrupted tokens from the encoder's output. This gives additional context to help the decoder restore the original text. The output from the decoder is passed to a language modeling head, which performs a linear transformation to convert the hidden states into logits. The cross-entropy loss is calculated between the logits and the label, which is just the token shifted to the right. + +Ready to try your hand at summarization? Check out our complete [summarization guide](tasks/summarization) to learn how to finetune T5 and use it for inference! + + + +For more information about text generation, check out the [text generation strategies](generation_strategies) guide! + + + +### Translation + +Translation is another example of a sequence-to-sequence task, which means you can use an encoder-decoder model like [BART](model_doc/bart) or [T5](model_doc/t5) to do it. We'll explain how BART works in this section, and then you can try finetuning T5 at the end. + +BART adapts to translation by adding a separate randomly initialized encoder to map a source language to an input that can be decoded into the target language. This new encoder's embeddings are passed to the pretrained encoder instead of the original word embeddings. The source encoder is trained by updating the source encoder, positional embeddings, and input embeddings with the cross-entropy loss from the model output. The model parameters are frozen in this first step, and all the model parameters are trained together in the second step. + +BART has since been followed up by a multilingual version, mBART, intended for translation and pretrained on many different languages. + +Ready to try your hand at translation? Check out our complete [translation guide](tasks/summarization) to learn how to finetune T5 and use it for inference! + + + +For more information about text generation, check out the [text generation strategies](generation_strategies) guide! + + \ No newline at end of file diff --git a/docs/source/en/testing.md b/docs/source/en/testing.md new file mode 100644 index 000000000000..dcd0893c5c33 --- /dev/null +++ b/docs/source/en/testing.md @@ -0,0 +1,1293 @@ + + +# Testing + + +Let's take a look at how 🤗 Transformers models are tested and how you can write new tests and improve the existing ones. + +There are 2 test suites in the repository: + +1. `tests` -- tests for the general API +2. `examples` -- tests primarily for various applications that aren't part of the API + +## How transformers are tested + +1. Once a PR is submitted it gets tested with 9 CircleCi jobs. Every new commit to that PR gets retested. 
These jobs + are defined in this [config file](https://github.com/huggingface/transformers/tree/main/.circleci/config.yml), so that if needed you can reproduce the same + environment on your machine. + + These CI jobs don't run `@slow` tests. + +2. There are 3 jobs run by [github actions](https://github.com/huggingface/transformers/actions): + + - [torch hub integration](https://github.com/huggingface/transformers/tree/main/.github/workflows/github-torch-hub.yml): checks whether torch hub + integration works. + + - [self-hosted (push)](https://github.com/huggingface/transformers/tree/main/.github/workflows/self-push.yml): runs fast tests on GPU only on commits on + `main`. It only runs if a commit on `main` has updated the code in one of the following folders: `src`, + `tests`, `.github` (to prevent running on added model cards, notebooks, etc.) + + - [self-hosted runner](https://github.com/huggingface/transformers/tree/main/.github/workflows/self-scheduled.yml): runs normal and slow tests on GPU in + `tests` and `examples`: + +```bash +RUN_SLOW=1 pytest tests/ +RUN_SLOW=1 pytest examples/ +``` + + The results can be observed [here](https://github.com/huggingface/transformers/actions). + + + +## Running tests + + + + + +### Choosing which tests to run + +This document goes into many details of how tests can be run. If after reading everything, you need even more details +you will find them [here](https://docs.pytest.org/en/latest/usage.html). + +Here are some most useful ways of running tests. + +Run all: + +```console +pytest +``` + +or: + +```bash +make test +``` + +Note that the latter is defined as: + +```bash +python -m pytest -n auto --dist=loadfile -s -v ./tests/ +``` + +which tells pytest to: + +- run as many test processes as they are CPU cores (which could be too many if you don't have a ton of RAM!) +- ensure that all tests from the same file will be run by the same test process +- do not capture output +- run in verbose mode + + + +### Getting the list of all tests + +All tests of the test suite: + +```bash +pytest --collect-only -q +``` + +All tests of a given test file: + +```bash +pytest tests/test_optimization.py --collect-only -q +``` + +### Run a specific test module + +To run an individual test module: + +```bash +pytest tests/utils/test_logging.py +``` + +### Run specific tests + +Since unittest is used inside most of the tests, to run specific subtests you need to know the name of the unittest +class containing those tests. For example, it could be: + +```bash +pytest tests/test_optimization.py::OptimizationTest::test_adam_w +``` + +Here: + +- `tests/test_optimization.py` - the file with tests +- `OptimizationTest` - the name of the class +- `test_adam_w` - the name of the specific test function + +If the file contains multiple classes, you can choose to run only tests of a given class. For example: + +```bash +pytest tests/test_optimization.py::OptimizationTest +``` + +will run all the tests inside that class. + +As mentioned earlier you can see what tests are contained inside the `OptimizationTest` class by running: + +```bash +pytest tests/test_optimization.py::OptimizationTest --collect-only -q +``` + +You can run tests by keyword expressions. + +To run only tests whose name contains `adam`: + +```bash +pytest -k adam tests/test_optimization.py +``` + +Logical `and` and `or` can be used to indicate whether all keywords should match or either. `not` can be used to +negate. 
+
+To run all tests except those whose name contains `adam`:
+
+```bash
+pytest -k "not adam" tests/test_optimization.py
+```
+
+And you can combine the two patterns in one:
+
+```bash
+pytest -k "ada and not adam" tests/test_optimization.py
+```
+
+For example, to run both `test_adafactor` and `test_adam_w` you can use:
+
+```bash
+pytest -k "test_adafactor or test_adam_w" tests/test_optimization.py
+```
+
+Note that we use `or` here, since we want either of the keywords to match to include both.
+
+If you want to include only tests that include both patterns, `and` is to be used:
+
+```bash
+pytest -k "test and ada" tests/test_optimization.py
+```
+
+### Run `accelerate` tests
+
+Sometimes you need to run `accelerate` tests on your models. To do so, just add `-m accelerate_tests` to your command. For example, to run these tests on `OPT`:
+```bash
+RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py
+```
+
+
+### Run documentation tests
+
+In order to test whether the documentation examples are correct, you should check that the `doctests` are passing.
+As an example, let's use [`WhisperModel.forward`'s docstring](https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py#L1017-L1035):
+
+```python
+r"""
+Returns:
+
+Example:
+    ```python
+    >>> import torch
+    >>> from transformers import WhisperModel, WhisperFeatureExtractor
+    >>> from datasets import load_dataset
+
+    >>> model = WhisperModel.from_pretrained("openai/whisper-base")
+    >>> feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
+    >>> input_features = inputs.input_features
+    >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
+    >>> last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
+    >>> list(last_hidden_state.shape)
+    [1, 2, 512]
+    ```"""
+
+```
+
+Just run the following line to automatically test every docstring example in the desired file:
+```bash
+pytest --doctest-modules <path_to_file_or_dir>
+```
+If the file has a markdown extension, you should add the `--doctest-glob="*.md"` argument.
+
+### Run only modified tests
+
+You can run the tests related to the unstaged files or the current branch (according to Git) by using [pytest-picked](https://github.com/anapaulagomes/pytest-picked). This is a great way of quickly testing that your changes didn't break
+anything, since it won't run the tests related to files you didn't touch.
+
+```bash
+pip install pytest-picked
+```
+
+```bash
+pytest --picked
+```
+
+All tests will be run from files and folders which are modified, but not yet committed.
+
+### Automatically rerun failed tests on source modification
+
+[pytest-xdist](https://github.com/pytest-dev/pytest-xdist) provides a very useful feature of detecting all failed
+tests, then waiting for you to modify files and continuously re-running those failing tests until they pass while you
+fix them, so you don't need to restart pytest after you make a fix. This is repeated until all tests pass, after
+which a full run is performed again.
+
+```bash
+pip install pytest-xdist
+```
+
+To enter the mode: `pytest -f` or `pytest --looponfail`
+
+File changes are detected by looking at `looponfailroots` root directories and all of their contents (recursively).
+If the default for this value does not work for you, you can change it in your project by setting a configuration +option in `setup.cfg`: + +```ini +[tool:pytest] +looponfailroots = transformers tests +``` + +or `pytest.ini`/``tox.ini`` files: + +```ini +[pytest] +looponfailroots = transformers tests +``` + +This would lead to only looking for file changes in the respective directories, specified relatively to the ini-file’s +directory. + +[pytest-watch](https://github.com/joeyespo/pytest-watch) is an alternative implementation of this functionality. + + +### Skip a test module + +If you want to run all test modules, except a few you can exclude them by giving an explicit list of tests to run. For +example, to run all except `test_modeling_*.py` tests: + +```bash +pytest *ls -1 tests/*py | grep -v test_modeling* +``` + +### Clearing state + +CI builds and when isolation is important (against speed), cache should be cleared: + +```bash +pytest --cache-clear tests +``` + +### Running tests in parallel + +As mentioned earlier `make test` runs tests in parallel via `pytest-xdist` plugin (`-n X` argument, e.g. `-n 2` +to run 2 parallel jobs). + +`pytest-xdist`'s `--dist=` option allows one to control how the tests are grouped. `--dist=loadfile` puts the +tests located in one file onto the same process. + +Since the order of executed tests is different and unpredictable, if running the test suite with `pytest-xdist` +produces failures (meaning we have some undetected coupled tests), use [pytest-replay](https://github.com/ESSS/pytest-replay) to replay the tests in the same order, which should help with then somehow +reducing that failing sequence to a minimum. + +### Test order and repetition + +It's good to repeat the tests several times, in sequence, randomly, or in sets, to detect any potential +inter-dependency and state-related bugs (tear down). And the straightforward multiple repetition is just good to detect +some problems that get uncovered by randomness of DL. + + +#### Repeat tests + +- [pytest-flakefinder](https://github.com/dropbox/pytest-flakefinder): + +```bash +pip install pytest-flakefinder +``` + +And then run every test multiple times (50 by default): + +```bash +pytest --flake-finder --flake-runs=5 tests/test_failing_test.py +``` + + + +This plugin doesn't work with `-n` flag from `pytest-xdist`. + + + + + +There is another plugin `pytest-repeat`, but it doesn't work with `unittest`. + + + +#### Run tests in a random order + +```bash +pip install pytest-random-order +``` + +Important: the presence of `pytest-random-order` will automatically randomize tests, no configuration change or +command line options is required. + +As explained earlier this allows detection of coupled tests - where one test's state affects the state of another. When +`pytest-random-order` is installed it will print the random seed it used for that session, e.g: + +```bash +pytest tests +[...] +Using --random-order-bucket=module +Using --random-order-seed=573663 +``` + +So that if the given particular sequence fails, you can reproduce it by adding that exact seed, e.g.: + +```bash +pytest --random-order-seed=573663 +[...] +Using --random-order-bucket=module +Using --random-order-seed=573663 +``` + +It will only reproduce the exact order if you use the exact same list of tests (or no list at all). 
Once you start to +manually narrowing down the list you can no longer rely on the seed, but have to list them manually in the exact order +they failed and tell pytest to not randomize them instead using `--random-order-bucket=none`, e.g.: + +```bash +pytest --random-order-bucket=none tests/test_a.py tests/test_c.py tests/test_b.py +``` + +To disable the shuffling for all tests: + +```bash +pytest --random-order-bucket=none +``` + +By default `--random-order-bucket=module` is implied, which will shuffle the files on the module levels. It can also +shuffle on `class`, `package`, `global` and `none` levels. For the complete details please see its +[documentation](https://github.com/jbasko/pytest-random-order). + +Another randomization alternative is: [`pytest-randomly`](https://github.com/pytest-dev/pytest-randomly). This +module has a very similar functionality/interface, but it doesn't have the bucket modes available in +`pytest-random-order`. It has the same problem of imposing itself once installed. + +### Look and feel variations + +#### pytest-sugar + +[pytest-sugar](https://github.com/Frozenball/pytest-sugar) is a plugin that improves the look-n-feel, adds a +progressbar, and show tests that fail and the assert instantly. It gets activated automatically upon installation. + +```bash +pip install pytest-sugar +``` + +To run tests without it, run: + +```bash +pytest -p no:sugar +``` + +or uninstall it. + + + +#### Report each sub-test name and its progress + +For a single or a group of tests via `pytest` (after `pip install pytest-pspec`): + +```bash +pytest --pspec tests/test_optimization.py +``` + +#### Instantly shows failed tests + +[pytest-instafail](https://github.com/pytest-dev/pytest-instafail) shows failures and errors instantly instead of +waiting until the end of test session. + +```bash +pip install pytest-instafail +``` + +```bash +pytest --instafail +``` + +### To GPU or not to GPU + +On a GPU-enabled setup, to test in CPU-only mode add `CUDA_VISIBLE_DEVICES=""`: + +```bash +CUDA_VISIBLE_DEVICES="" pytest tests/utils/test_logging.py +``` + +or if you have multiple gpus, you can specify which one is to be used by `pytest`. For example, to use only the +second gpu if you have gpus `0` and `1`, you can run: + +```bash +CUDA_VISIBLE_DEVICES="1" pytest tests/utils/test_logging.py +``` + +This is handy when you want to run different tasks on different GPUs. + +Some tests must be run on CPU-only, others on either CPU or GPU or TPU, yet others on multiple-GPUs. 
The following skip +decorators are used to set the requirements of tests CPU/GPU/TPU-wise: + +- `require_torch` - this test will run only under torch +- `require_torch_gpu` - as `require_torch` plus requires at least 1 GPU +- `require_torch_multi_gpu` - as `require_torch` plus requires at least 2 GPUs +- `require_torch_non_multi_gpu` - as `require_torch` plus requires 0 or 1 GPUs +- `require_torch_up_to_2_gpus` - as `require_torch` plus requires 0 or 1 or 2 GPUs +- `require_torch_tpu` - as `require_torch` plus requires at least 1 TPU + +Let's depict the GPU requirements in the following table: + + +| n gpus | decorator | +|--------+--------------------------------| +| `>= 0` | `@require_torch` | +| `>= 1` | `@require_torch_gpu` | +| `>= 2` | `@require_torch_multi_gpu` | +| `< 2` | `@require_torch_non_multi_gpu` | +| `< 3` | `@require_torch_up_to_2_gpus` | + + +For example, here is a test that must be run only when there are 2 or more GPUs available and pytorch is installed: + +```python no-style +@require_torch_multi_gpu +def test_example_with_multi_gpu(): +``` + +If a test requires `tensorflow` use the `require_tf` decorator. For example: + +```python no-style +@require_tf +def test_tf_thing_with_tensorflow(): +``` + +These decorators can be stacked. For example, if a test is slow and requires at least one GPU under pytorch, here is +how to set it up: + +```python no-style +@require_torch_gpu +@slow +def test_example_slow_on_gpu(): +``` + +Some decorators like `@parametrized` rewrite test names, therefore `@require_*` skip decorators have to be listed +last for them to work correctly. Here is an example of the correct usage: + +```python no-style +@parameterized.expand(...) +@require_torch_multi_gpu +def test_integration_foo(): +``` + +This order problem doesn't exist with `@pytest.mark.parametrize`, you can put it first or last and it will still +work. But it only works with non-unittests. + +Inside tests: + +- How many GPUs are available: + +```python +from transformers.testing_utils import get_gpu_count + +n_gpu = get_gpu_count() # works with torch and tf +``` + +### Testing with a specific PyTorch backend or device + +To run the test suite on a specific torch device add `TRANSFORMERS_TEST_DEVICE="$device"` where `$device` is the target backend. For example, to test on CPU only: +```bash +TRANSFORMERS_TEST_DEVICE="cpu" pytest tests/utils/test_logging.py +``` + +This variable is useful for testing custom or less common PyTorch backends such as `mps`. It can also be used to achieve the same effect as `CUDA_VISIBLE_DEVICES` by targeting specific GPUs or testing in CPU-only mode. + +Certain devices will require an additional import after importing `torch` for the first time. This can be specified using the environment variable `TRANSFORMERS_TEST_BACKEND`: +```bash +TRANSFORMERS_TEST_BACKEND="torch_npu" pytest tests/utils/test_logging.py +``` + + +### Distributed training + +`pytest` can't deal with distributed training directly. If this is attempted - the sub-processes don't do the right +thing and end up thinking they are `pytest` and start running the test suite in loops. It works, however, if one +spawns a normal process that then spawns off multiple workers and manages the IO pipes. 
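+Roughly, the pattern looks like the sketch below. Treat it only as an illustration: the worker script name and the launcher arguments are made up for the example, but the helpers (`TestCasePlus` with `get_env`/`get_auto_remove_tmp_dir`, and `execute_subprocess_async`) are the ones from `transformers.testing_utils` that the existing distributed tests rely on:
+
+```python
+# a minimal sketch, assuming 2+ GPUs and a hypothetical worker script next to the test file
+import sys
+
+import torch
+
+from transformers.testing_utils import TestCasePlus, execute_subprocess_async, require_torch_multi_gpu
+
+
+class MyDistributedTest(TestCasePlus):
+    @require_torch_multi_gpu
+    def test_ddp_something(self):
+        # launch one worker process per GPU in a fresh subprocess
+        cmd = [
+            sys.executable,
+            "-m",
+            "torch.distributed.run",
+            f"--nproc_per_node={torch.cuda.device_count()}",
+            f"{self.test_file_dir}/my_ddp_worker.py",  # hypothetical script doing the actual distributed work
+            "--output_dir",
+            self.get_auto_remove_tmp_dir(),
+        ]
+        # streams the subprocess output and raises if it exits with an error
+        execute_subprocess_async(cmd, env=self.get_env())
+```
+
+The key point is that `pytest` itself only sees a single ordinary test; all the distributed logic happens inside the spawned subprocess.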
+ +Here are some tests that use it: + +- [test_trainer_distributed.py](https://github.com/huggingface/transformers/tree/main/tests/trainer/test_trainer_distributed.py) +- [test_deepspeed.py](https://github.com/huggingface/transformers/tree/main/tests/deepspeed/test_deepspeed.py) + +To jump right into the execution point, search for the `execute_subprocess_async` call in those tests. + +You will need at least 2 GPUs to see these tests in action: + +```bash +CUDA_VISIBLE_DEVICES=0,1 RUN_SLOW=1 pytest -sv tests/test_trainer_distributed.py +``` + +### Output capture + +During test execution any output sent to `stdout` and `stderr` is captured. If a test or a setup method fails, its +according captured output will usually be shown along with the failure traceback. + +To disable output capturing and to get the `stdout` and `stderr` normally, use `-s` or `--capture=no`: + +```bash +pytest -s tests/utils/test_logging.py +``` + +To send test results to JUnit format output: + +```bash +py.test tests --junitxml=result.xml +``` + +### Color control + +To have no color (e.g., yellow on white background is not readable): + +```bash +pytest --color=no tests/utils/test_logging.py +``` + +### Sending test report to online pastebin service + +Creating a URL for each test failure: + +```bash +pytest --pastebin=failed tests/utils/test_logging.py +``` + +This will submit test run information to a remote Paste service and provide a URL for each failure. You may select +tests as usual or add for example -x if you only want to send one particular failure. + +Creating a URL for a whole test session log: + +```bash +pytest --pastebin=all tests/utils/test_logging.py +``` + +## Writing tests + +🤗 transformers tests are based on `unittest`, but run by `pytest`, so most of the time features from both systems +can be used. + +You can read [here](https://docs.pytest.org/en/stable/unittest.html) which features are supported, but the important +thing to remember is that most `pytest` fixtures don't work. Neither parametrization, but we use the module +`parameterized` that works in a similar way. + + +### Parametrization + +Often, there is a need to run the same test multiple times, but with different arguments. It could be done from within +the test, but then there is no way of running that test for just one set of arguments. + +```python +# test_this1.py +import unittest +from parameterized import parameterized + + +class TestMathUnitTest(unittest.TestCase): + @parameterized.expand( + [ + ("negative", -1.5, -2.0), + ("integer", 1, 1.0), + ("large fraction", 1.6, 1), + ] + ) + def test_floor(self, name, input, expected): + assert_equal(math.floor(input), expected) +``` + +Now, by default this test will be run 3 times, each time with the last 3 arguments of `test_floor` being assigned the +corresponding arguments in the parameter list. + +and you could run just the `negative` and `integer` sets of params with: + +```bash +pytest -k "negative and integer" tests/test_mytest.py +``` + +or all but `negative` sub-tests, with: + +```bash +pytest -k "not negative" tests/test_mytest.py +``` + +Besides using the `-k` filter that was just mentioned, you can find out the exact name of each sub-test and run any +or all of them using their exact names. 
+ +```bash +pytest test_this1.py --collect-only -q +``` + +and it will list: + +```bash +test_this1.py::TestMathUnitTest::test_floor_0_negative +test_this1.py::TestMathUnitTest::test_floor_1_integer +test_this1.py::TestMathUnitTest::test_floor_2_large_fraction +``` + +So now you can run just 2 specific sub-tests: + +```bash +pytest test_this1.py::TestMathUnitTest::test_floor_0_negative test_this1.py::TestMathUnitTest::test_floor_1_integer +``` + +The module [parameterized](https://pypi.org/project/parameterized/) which is already in the developer dependencies +of `transformers` works for both: `unittests` and `pytest` tests. + +If, however, the test is not a `unittest`, you may use `pytest.mark.parametrize` (or you may see it being used in +some existing tests, mostly under `examples`). + +Here is the same example, this time using `pytest`'s `parametrize` marker: + +```python +# test_this2.py +import pytest + + +@pytest.mark.parametrize( + "name, input, expected", + [ + ("negative", -1.5, -2.0), + ("integer", 1, 1.0), + ("large fraction", 1.6, 1), + ], +) +def test_floor(name, input, expected): + assert_equal(math.floor(input), expected) +``` + +Same as with `parameterized`, with `pytest.mark.parametrize` you can have a fine control over which sub-tests are +run, if the `-k` filter doesn't do the job. Except, this parametrization function creates a slightly different set of +names for the sub-tests. Here is what they look like: + +```bash +pytest test_this2.py --collect-only -q +``` + +and it will list: + +```bash +test_this2.py::test_floor[integer-1-1.0] +test_this2.py::test_floor[negative--1.5--2.0] +test_this2.py::test_floor[large fraction-1.6-1] +``` + +So now you can run just the specific test: + +```bash +pytest test_this2.py::test_floor[negative--1.5--2.0] test_this2.py::test_floor[integer-1-1.0] +``` + +as in the previous example. + + + +### Files and directories + +In tests often we need to know where things are relative to the current test file, and it's not trivial since the test +could be invoked from more than one directory or could reside in sub-directories with different depths. A helper class +`transformers.test_utils.TestCasePlus` solves this problem by sorting out all the basic paths and provides easy +accessors to them: + +- `pathlib` objects (all fully resolved): + + - `test_file_path` - the current test file path, i.e. `__file__` + - `test_file_dir` - the directory containing the current test file + - `tests_dir` - the directory of the `tests` test suite + - `examples_dir` - the directory of the `examples` test suite + - `repo_root_dir` - the directory of the repository + - `src_dir` - the directory of `src` (i.e. where the `transformers` sub-dir resides) + +- stringified paths---same as above but these return paths as strings, rather than `pathlib` objects: + + - `test_file_path_str` + - `test_file_dir_str` + - `tests_dir_str` + - `examples_dir_str` + - `repo_root_dir_str` + - `src_dir_str` + +To start using those all you need is to make sure that the test resides in a subclass of +`transformers.test_utils.TestCasePlus`. For example: + +```python +from transformers.testing_utils import TestCasePlus + + +class PathExampleTest(TestCasePlus): + def test_something_involving_local_locations(self): + data_dir = self.tests_dir / "fixtures/tests_samples/wmt_en_ro" +``` + +If you don't need to manipulate paths via `pathlib` or you just need a path as a string, you can always invoked +`str()` on the `pathlib` object or use the accessors ending with `_str`. 
For example: + +```python +from transformers.testing_utils import TestCasePlus + + +class PathExampleTest(TestCasePlus): + def test_something_involving_stringified_locations(self): + examples_dir = self.examples_dir_str +``` + +### Temporary files and directories + +Using unique temporary files and directories are essential for parallel test running, so that the tests won't overwrite +each other's data. Also we want to get the temporary files and directories removed at the end of each test that created +them. Therefore, using packages like `tempfile`, which address these needs is essential. + +However, when debugging tests, you need to be able to see what goes into the temporary file or directory and you want +to know it's exact path and not having it randomized on every test re-run. + +A helper class `transformers.test_utils.TestCasePlus` is best used for such purposes. It's a sub-class of +`unittest.TestCase`, so we can easily inherit from it in the test modules. + +Here is an example of its usage: + +```python +from transformers.testing_utils import TestCasePlus + + +class ExamplesTests(TestCasePlus): + def test_whatever(self): + tmp_dir = self.get_auto_remove_tmp_dir() +``` + +This code creates a unique temporary directory, and sets `tmp_dir` to its location. + +- Create a unique temporary dir: + +```python +def test_whatever(self): + tmp_dir = self.get_auto_remove_tmp_dir() +``` + +`tmp_dir` will contain the path to the created temporary dir. It will be automatically removed at the end of the +test. + +- Create a temporary dir of my choice, ensure it's empty before the test starts and don't empty it after the test. + +```python +def test_whatever(self): + tmp_dir = self.get_auto_remove_tmp_dir("./xxx") +``` + +This is useful for debug when you want to monitor a specific directory and want to make sure the previous tests didn't +leave any data in there. + +- You can override the default behavior by directly overriding the `before` and `after` args, leading to one of the + following behaviors: + + - `before=True`: the temporary dir will always be cleared at the beginning of the test. + - `before=False`: if the temporary dir already existed, any existing files will remain there. + - `after=True`: the temporary dir will always be deleted at the end of the test. + - `after=False`: the temporary dir will always be left intact at the end of the test. + + + +In order to run the equivalent of `rm -r` safely, only subdirs of the project repository checkout are allowed if +an explicit `tmp_dir` is used, so that by mistake no `/tmp` or similar important part of the filesystem will +get nuked. i.e. please always pass paths that start with `./`. + + + + + +Each test can register multiple temporary directories and they all will get auto-removed, unless requested +otherwise. + + + +### Temporary sys.path override + +If you need to temporary override `sys.path` to import from another test for example, you can use the +`ExtendSysPath` context manager. Example: + + +```python +import os +from transformers.testing_utils import ExtendSysPath + +bindir = os.path.abspath(os.path.dirname(__file__)) +with ExtendSysPath(f"{bindir}/.."): + from test_trainer import TrainerIntegrationCommon # noqa +``` + +### Skipping tests + +This is useful when a bug is found and a new test is written, yet the bug is not fixed yet. In order to be able to +commit it to the main repository we need make sure it's skipped during `make test`. 
+ +Methods: + +- A **skip** means that you expect your test to pass only if some conditions are met, otherwise pytest should skip + running the test altogether. Common examples are skipping windows-only tests on non-windows platforms, or skipping + tests that depend on an external resource which is not available at the moment (for example a database). + +- A **xfail** means that you expect a test to fail for some reason. A common example is a test for a feature not yet + implemented, or a bug not yet fixed. When a test passes despite being expected to fail (marked with + pytest.mark.xfail), it’s an xpass and will be reported in the test summary. + +One of the important differences between the two is that `skip` doesn't run the test, and `xfail` does. So if the +code that's buggy causes some bad state that will affect other tests, do not use `xfail`. + +#### Implementation + +- Here is how to skip whole test unconditionally: + +```python no-style +@unittest.skip("this bug needs to be fixed") +def test_feature_x(): +``` + +or via pytest: + +```python no-style +@pytest.mark.skip(reason="this bug needs to be fixed") +``` + +or the `xfail` way: + +```python no-style +@pytest.mark.xfail +def test_feature_x(): +``` + +- Here is how to skip a test based on some internal check inside the test: + +```python +def test_feature_x(): + if not has_something(): + pytest.skip("unsupported configuration") +``` + +or the whole module: + +```python +import pytest + +if not pytest.config.getoption("--custom-flag"): + pytest.skip("--custom-flag is missing, skipping tests", allow_module_level=True) +``` + +or the `xfail` way: + +```python +def test_feature_x(): + pytest.xfail("expected to fail until bug XYZ is fixed") +``` + +- Here is how to skip all tests in a module if some import is missing: + +```python +docutils = pytest.importorskip("docutils", minversion="0.3") +``` + +- Skip a test based on a condition: + +```python no-style +@pytest.mark.skipif(sys.version_info < (3,6), reason="requires python3.6 or higher") +def test_feature_x(): +``` + +or: + +```python no-style +@unittest.skipIf(torch_device == "cpu", "Can't do half precision") +def test_feature_x(): +``` + +or skip the whole module: + +```python no-style +@pytest.mark.skipif(sys.platform == 'win32', reason="does not run on windows") +class TestClass(): + def test_feature_x(self): +``` + +More details, example and ways are [here](https://docs.pytest.org/en/latest/skipping.html). + +### Slow tests + +The library of tests is ever-growing, and some of the tests take minutes to run, therefore we can't afford waiting for +an hour for the test suite to complete on CI. Therefore, with some exceptions for essential tests, slow tests should be +marked as in the example below: + +```python no-style +from transformers.testing_utils import slow +@slow +def test_integration_foo(): +``` + +Once a test is marked as `@slow`, to run such tests set `RUN_SLOW=1` env var, e.g.: + +```bash +RUN_SLOW=1 pytest tests +``` + +Some decorators like `@parameterized` rewrite test names, therefore `@slow` and the rest of the skip decorators +`@require_*` have to be listed last for them to work correctly. Here is an example of the correct usage: + +```python no-style +@parameteriz ed.expand(...) +@slow +def test_integration_foo(): +``` + +As explained at the beginning of this document, slow tests get to run on a scheduled basis, rather than in PRs CI +checks. So it's possible that some problems will be missed during a PR submission and get merged. 
Such problems will
+get caught during the next scheduled CI job. But it also means that it's important to run the slow tests on your
+machine before submitting the PR.
+
+Here is a rough decision-making mechanism for choosing which tests should be marked as slow:
+
+If the test is focused on one of the library's internal components (e.g., modeling files, tokenization files,
+pipelines), then we should run that test in the non-slow test suite. If it's focused on another aspect of the library,
+such as the documentation or the examples, then we should run these tests in the slow test suite. And then, to refine
+this approach we should have exceptions:
+
+- All tests that need to download a heavy set of weights or a dataset that is larger than ~50MB (e.g., model or
+  tokenizer integration tests, pipeline integration tests) should be set to slow. If you're adding a new model, you
+  should create and upload to the hub a tiny version of it (with random weights) for integration tests. This is
+  discussed in the following paragraphs.
+- All tests that need to do a training not specifically optimized to be fast should be set to slow.
+- We can introduce exceptions if some of these should-be-non-slow tests are excruciatingly slow, and set them to
+  `@slow`. Auto-modeling tests, which save and load large files to disk, are a good example of tests that are marked
+  as `@slow`.
+- If a test completes under 1 second on CI (including downloads if any) then it should be a normal test regardless.
+
+Collectively, all the non-slow tests need to cover the different internals entirely, while remaining fast. For example,
+significant coverage can be achieved by testing with specially created tiny models with random weights. Such models
+have the very minimal number of layers (e.g., 2), vocab size (e.g., 1000), etc. Then the `@slow` tests can use large
+slow models to do qualitative testing. To see the use of these simply look for *tiny* models with:
+
+```bash
+grep tiny tests examples
+```
+
+Here is an example of a [script](https://github.com/huggingface/transformers/tree/main/scripts/fsmt/fsmt-make-tiny-model.py) that created the tiny model
+[stas/tiny-wmt19-en-de](https://huggingface.co/stas/tiny-wmt19-en-de). You can easily adjust it to your specific
+model's architecture.
+
+It's easy to measure the run-time incorrectly if, for example, there is an overhead of downloading a huge model: when
+you test locally the downloaded files are cached, so the download time isn't measured. Hence check the
+execution speed report in CI logs instead (the output of `pytest --durations=0 tests`).
+
+That report is also useful to find slow outliers that aren't marked as such, or which need to be re-written to be fast.
+If you notice that the test suite starts getting slow on CI, the top listing of this report will show the slowest
+tests.
+
+
+### Testing the stdout/stderr output
+
+In order to test functions that write to `stdout` and/or `stderr`, the test can access those streams using
+`pytest`'s [capsys system](https://docs.pytest.org/en/latest/capture.html).
Here is how this is accomplished: + +```python +import sys + + +def print_to_stdout(s): + print(s) + + +def print_to_stderr(s): + sys.stderr.write(s) + + +def test_result_and_stdout(capsys): + msg = "Hello" + print_to_stdout(msg) + print_to_stderr(msg) + out, err = capsys.readouterr() # consume the captured output streams + # optional: if you want to replay the consumed streams: + sys.stdout.write(out) + sys.stderr.write(err) + # test: + assert msg in out + assert msg in err +``` + +And, of course, most of the time, `stderr` will come as a part of an exception, so try/except has to be used in such +a case: + +```python +def raise_exception(msg): + raise ValueError(msg) + + +def test_something_exception(): + msg = "Not a good value" + error = "" + try: + raise_exception(msg) + except Exception as e: + error = str(e) + assert msg in error, f"{msg} is in the exception:\n{error}" +``` + +Another approach to capturing stdout is via `contextlib.redirect_stdout`: + +```python +from io import StringIO +from contextlib import redirect_stdout + + +def print_to_stdout(s): + print(s) + + +def test_result_and_stdout(): + msg = "Hello" + buffer = StringIO() + with redirect_stdout(buffer): + print_to_stdout(msg) + out = buffer.getvalue() + # optional: if you want to replay the consumed streams: + sys.stdout.write(out) + # test: + assert msg in out +``` + +An important potential issue with capturing stdout is that it may contain `\r` characters that in normal `print` +reset everything that has been printed so far. There is no problem with `pytest`, but with `pytest -s` these +characters get included in the buffer, so to be able to have the test run with and without `-s`, you have to make an +extra cleanup to the captured output, using `re.sub(r'~.*\r', '', buf, 0, re.M)`. + +But, then we have a helper context manager wrapper to automatically take care of it all, regardless of whether it has +some `\r`'s in it or not, so it's a simple: + +```python +from transformers.testing_utils import CaptureStdout + +with CaptureStdout() as cs: + function_that_writes_to_stdout() +print(cs.out) +``` + +Here is a full test example: + +```python +from transformers.testing_utils import CaptureStdout + +msg = "Secret message\r" +final = "Hello World" +with CaptureStdout() as cs: + print(msg + final) +assert cs.out == final + "\n", f"captured: {cs.out}, expecting {final}" +``` + +If you'd like to capture `stderr` use the `CaptureStderr` class instead: + +```python +from transformers.testing_utils import CaptureStderr + +with CaptureStderr() as cs: + function_that_writes_to_stderr() +print(cs.err) +``` + +If you need to capture both streams at once, use the parent `CaptureStd` class: + +```python +from transformers.testing_utils import CaptureStd + +with CaptureStd() as cs: + function_that_writes_to_stdout_and_stderr() +print(cs.err, cs.out) +``` + +Also, to aid debugging test issues, by default these context managers automatically replay the captured streams on exit +from the context. 
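+Conversely, if you need some output to bypass capturing entirely (say, a progress note you always want to see in the terminal), plain `pytest` offers the `capsys.disabled()` context manager. This is a generic pytest feature rather than a `transformers.testing_utils` helper; a minimal illustration:
+
+```python
+def test_disabling_capturing(capsys):
+    print("this output is captured")
+    with capsys.disabled():
+        # anything printed here goes straight to the terminal, even without `-s`
+        print("output not captured, going directly to sys.stdout")
+    print("this output is also captured")
+```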
+ + +### Capturing logger stream + +If you need to validate the output of a logger, you can use `CaptureLogger`: + +```python +from transformers import logging +from transformers.testing_utils import CaptureLogger + +msg = "Testing 1, 2, 3" +logging.set_verbosity_info() +logger = logging.get_logger("transformers.models.bart.tokenization_bart") +with CaptureLogger(logger) as cl: + logger.info(msg) +assert cl.out, msg + "\n" +``` + +### Testing with environment variables + +If you want to test the impact of environment variables for a specific test you can use a helper decorator +`transformers.testing_utils.mockenv` + +```python +from transformers.testing_utils import mockenv + + +class HfArgumentParserTest(unittest.TestCase): + @mockenv(TRANSFORMERS_VERBOSITY="error") + def test_env_override(self): + env_level_str = os.getenv("TRANSFORMERS_VERBOSITY", None) +``` + +At times an external program needs to be called, which requires setting `PYTHONPATH` in `os.environ` to include +multiple local paths. A helper class `transformers.test_utils.TestCasePlus` comes to help: + +```python +from transformers.testing_utils import TestCasePlus + + +class EnvExampleTest(TestCasePlus): + def test_external_prog(self): + env = self.get_env() + # now call the external program, passing `env` to it +``` + +Depending on whether the test file was under the `tests` test suite or `examples` it'll correctly set up +`env[PYTHONPATH]` to include one of these two directories, and also the `src` directory to ensure the testing is +done against the current repo, and finally with whatever `env[PYTHONPATH]` was already set to before the test was +called if anything. + +This helper method creates a copy of the `os.environ` object, so the original remains intact. + + +### Getting reproducible results + +In some situations you may want to remove randomness for your tests. To get identical reproducible results set, you +will need to fix the seed: + +```python +seed = 42 + +# python RNG +import random + +random.seed(seed) + +# pytorch RNGs +import torch + +torch.manual_seed(seed) +torch.backends.cudnn.deterministic = True +if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + +# numpy RNG +import numpy as np + +np.random.seed(seed) + +# tf RNG +tf.random.set_seed(seed) +``` + +### Debugging tests + +To start a debugger at the point of the warning, do this: + +```bash +pytest tests/utils/test_logging.py -W error::UserWarning --pdb +``` + +## Working with github actions workflows + +To trigger a self-push workflow CI job, you must: + +1. Create a new branch on `transformers` origin (not a fork!). +2. The branch name has to start with either `ci_` or `ci-` (`main` triggers it too, but we can't do PRs on + `main`). It also gets triggered only for specific paths - you can find the up-to-date definition in case it + changed since this document has been written [here](https://github.com/huggingface/transformers/blob/main/.github/workflows/self-push.yml) under *push:* +3. Create a PR from this branch. +4. Then you can see the job appear [here](https://github.com/huggingface/transformers/actions/workflows/self-push.yml). It may not run right away if there + is a backlog. + + + + +## Testing Experimental CI Features + +Testing CI features can be potentially problematic as it can interfere with the normal CI functioning. Therefore if a +new CI feature is to be added, it should be done as following. + +1. Create a new dedicated job that tests what needs to be tested +2. 
The new job must always succeed so that it gives us a green ✓ (details below). +3. Let it run for some days to see that a variety of different PR types get to run on it (user fork branches, + non-forked branches, branches originating from github.com UI direct file edit, various forced pushes, etc. - there + are so many) while monitoring the experimental job's logs (not the overall job green as it's purposefully always + green) +4. When it's clear that everything is solid, then merge the new changes into existing jobs. + +That way experiments on CI functionality itself won't interfere with the normal workflow. + +Now how can we make the job always succeed while the new CI feature is being developed? + +Some CIs, like TravisCI support ignore-step-failure and will report the overall job as successful, but CircleCI and +Github Actions as of this writing don't support that. + +So the following workaround can be used: + +1. `set +euo pipefail` at the beginning of the run command to suppress most potential failures in the bash script. +2. the last command must be a success: `echo "done"` or just `true` will do + +Here is an example: + +```yaml +- run: + name: run CI experiment + command: | + set +euo pipefail + echo "setting run-all-despite-any-errors-mode" + this_command_will_fail + echo "but bash continues to run" + # emulate another failure + false + # but the last command must be a success + echo "during experiment do not remove: reporting success to CI, even if there were failures" +``` + +For simple commands you could also do: + +```bash +cmd_that_may_fail || true +``` + +Of course, once satisfied with the results, integrate the experimental step or job with the rest of the normal jobs, +while removing `set +euo pipefail` or any other things you may have added to ensure that the experimental job doesn't +interfere with the normal CI functioning. + +This whole process would have been much easier if we only could set something like `allow-failure` for the +experimental step, and let it fail without impacting the overall status of PRs. But as mentioned earlier CircleCI and +Github Actions don't support it at the moment. + +You can vote for this feature and see where it is at these CI-specific threads: + +- [Github Actions:](https://github.com/actions/toolkit/issues/399) +- [CircleCI:](https://ideas.circleci.com/ideas/CCI-I-344) diff --git a/docs/source/en/testing.mdx b/docs/source/en/testing.mdx deleted file mode 100644 index cb03a57b0413..000000000000 --- a/docs/source/en/testing.mdx +++ /dev/null @@ -1,1273 +0,0 @@ - - -# Testing - - -Let's take a look at how 🤗 Transformers models are tested and how you can write new tests and improve the existing ones. - -There are 2 test suites in the repository: - -1. `tests` -- tests for the general API -2. `examples` -- tests primarily for various applications that aren't part of the API - -## How transformers are tested - -1. Once a PR is submitted it gets tested with 9 CircleCi jobs. Every new commit to that PR gets retested. These jobs - are defined in this [config file](https://github.com/huggingface/transformers/tree/main/.circleci/config.yml), so that if needed you can reproduce the same - environment on your machine. - - These CI jobs don't run `@slow` tests. - -2. There are 3 jobs run by [github actions](https://github.com/huggingface/transformers/actions): - - - [torch hub integration](https://github.com/huggingface/transformers/tree/main/.github/workflows/github-torch-hub.yml): checks whether torch hub - integration works. 
- - - [self-hosted (push)](https://github.com/huggingface/transformers/tree/main/.github/workflows/self-push.yml): runs fast tests on GPU only on commits on - `main`. It only runs if a commit on `main` has updated the code in one of the following folders: `src`, - `tests`, `.github` (to prevent running on added model cards, notebooks, etc.) - - - [self-hosted runner](https://github.com/huggingface/transformers/tree/main/.github/workflows/self-scheduled.yml): runs normal and slow tests on GPU in - `tests` and `examples`: - -```bash -RUN_SLOW=1 pytest tests/ -RUN_SLOW=1 pytest examples/ -``` - - The results can be observed [here](https://github.com/huggingface/transformers/actions). - - - -## Running tests - - - - - -### Choosing which tests to run - -This document goes into many details of how tests can be run. If after reading everything, you need even more details -you will find them [here](https://docs.pytest.org/en/latest/usage.html). - -Here are some most useful ways of running tests. - -Run all: - -```console -pytest -``` - -or: - -```bash -make test -``` - -Note that the latter is defined as: - -```bash -python -m pytest -n auto --dist=loadfile -s -v ./tests/ -``` - -which tells pytest to: - -- run as many test processes as they are CPU cores (which could be too many if you don't have a ton of RAM!) -- ensure that all tests from the same file will be run by the same test process -- do not capture output -- run in verbose mode - - - -### Getting the list of all tests - -All tests of the test suite: - -```bash -pytest --collect-only -q -``` - -All tests of a given test file: - -```bash -pytest tests/test_optimization.py --collect-only -q -``` - -### Run a specific test module - -To run an individual test module: - -```bash -pytest tests/test_logging.py -``` - -### Run specific tests - -Since unittest is used inside most of the tests, to run specific subtests you need to know the name of the unittest -class containing those tests. For example, it could be: - -```bash -pytest tests/test_optimization.py::OptimizationTest::test_adam_w -``` - -Here: - -- `tests/test_optimization.py` - the file with tests -- `OptimizationTest` - the name of the class -- `test_adam_w` - the name of the specific test function - -If the file contains multiple classes, you can choose to run only tests of a given class. For example: - -```bash -pytest tests/test_optimization.py::OptimizationTest -``` - -will run all the tests inside that class. - -As mentioned earlier you can see what tests are contained inside the `OptimizationTest` class by running: - -```bash -pytest tests/test_optimization.py::OptimizationTest --collect-only -q -``` - -You can run tests by keyword expressions. - -To run only tests whose name contains `adam`: - -```bash -pytest -k adam tests/test_optimization.py -``` - -Logical `and` and `or` can be used to indicate whether all keywords should match or either. `not` can be used to -negate. - -To run all tests except those whose name contains `adam`: - -```bash -pytest -k "not adam" tests/test_optimization.py -``` - -And you can combine the two patterns in one: - -```bash -pytest -k "ada and not adam" tests/test_optimization.py -``` - -For example to run both `test_adafactor` and `test_adam_w` you can use: - -```bash -pytest -k "test_adam_w or test_adam_w" tests/test_optimization.py -``` - -Note that we use `or` here, since we want either of the keywords to match to include both. 
- -If you want to include only tests that include both patterns, `and` is to be used: - -```bash -pytest -k "test and ada" tests/test_optimization.py -``` -### Run documentation tests - -In order to test whether the documentation examples are correct, you should check that the `doctests` are passing. -As an example, let's use [`WhisperModel.forward`'s docstring](https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py#L1017-L1035): - -```python -r""" -Returns: - -Example: - ```python - >>> import torch - >>> from transformers import WhisperModel, WhisperFeatureExtractor - >>> from datasets import load_dataset - - >>> model = WhisperModel.from_pretrained("openai/whisper-base") - >>> feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt") - >>> input_features = inputs.input_features - >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id - >>> last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state - >>> list(last_hidden_state.shape) - [1, 2, 512] - ```""" - -``` -3 steps are required to debug the docstring examples: -1. In order to properly run the test, **an extra line has to be added** at the end of the docstring. This can be automatically done on any file using: -```bash -python utils/prepare_for_doc_test.py -``` - -2. Then, you can use the following line to automatically test every docstring example in the desired file: -```bash -pytest --doctest-modules -``` -3. Once you are done debugging, you need to remove the extra line added in step **1.** by running the following: -```bash -python utils/prepare_for_doc_test.py --remove_new_line -``` - -### Run only modified tests - -You can run the tests related to the unstaged files or the current branch (according to Git) by using [pytest-picked](https://github.com/anapaulagomes/pytest-picked). This is a great way of quickly testing your changes didn't break -anything, since it won't run the tests related to files you didn't touch. - -```bash -pip install pytest-picked -``` - -```bash -pytest --picked -``` - -All tests will be run from files and folders which are modified, but not yet committed. - -### Automatically rerun failed tests on source modification - -[pytest-xdist](https://github.com/pytest-dev/pytest-xdist) provides a very useful feature of detecting all failed -tests, and then waiting for you to modify files and continuously re-rerun those failing tests until they pass while you -fix them. So that you don't need to re start pytest after you made the fix. This is repeated until all tests pass after -which again a full run is performed. - -```bash -pip install pytest-xdist -``` - -To enter the mode: `pytest -f` or `pytest --looponfail` - -File changes are detected by looking at `looponfailroots` root directories and all of their contents (recursively). -If the default for this value does not work for you, you can change it in your project by setting a configuration -option in `setup.cfg`: - -```ini -[tool:pytest] -looponfailroots = transformers tests -``` - -or `pytest.ini`/``tox.ini`` files: - -```ini -[pytest] -looponfailroots = transformers tests -``` - -This would lead to only looking for file changes in the respective directories, specified relatively to the ini-file’s -directory. 
- -[pytest-watch](https://github.com/joeyespo/pytest-watch) is an alternative implementation of this functionality. - - -### Skip a test module - -If you want to run all test modules, except a few you can exclude them by giving an explicit list of tests to run. For -example, to run all except `test_modeling_*.py` tests: - -```bash -pytest *ls -1 tests/*py | grep -v test_modeling* -``` - -### Clearing state - -CI builds and when isolation is important (against speed), cache should be cleared: - -```bash -pytest --cache-clear tests -``` - -### Running tests in parallel - -As mentioned earlier `make test` runs tests in parallel via `pytest-xdist` plugin (`-n X` argument, e.g. `-n 2` -to run 2 parallel jobs). - -`pytest-xdist`'s `--dist=` option allows one to control how the tests are grouped. `--dist=loadfile` puts the -tests located in one file onto the same process. - -Since the order of executed tests is different and unpredictable, if running the test suite with `pytest-xdist` -produces failures (meaning we have some undetected coupled tests), use [pytest-replay](https://github.com/ESSS/pytest-replay) to replay the tests in the same order, which should help with then somehow -reducing that failing sequence to a minimum. - -### Test order and repetition - -It's good to repeat the tests several times, in sequence, randomly, or in sets, to detect any potential -inter-dependency and state-related bugs (tear down). And the straightforward multiple repetition is just good to detect -some problems that get uncovered by randomness of DL. - - -#### Repeat tests - -- [pytest-flakefinder](https://github.com/dropbox/pytest-flakefinder): - -```bash -pip install pytest-flakefinder -``` - -And then run every test multiple times (50 by default): - -```bash -pytest --flake-finder --flake-runs=5 tests/test_failing_test.py -``` - - - -This plugin doesn't work with `-n` flag from `pytest-xdist`. - - - - - -There is another plugin `pytest-repeat`, but it doesn't work with `unittest`. - - - -#### Run tests in a random order - -```bash -pip install pytest-random-order -``` - -Important: the presence of `pytest-random-order` will automatically randomize tests, no configuration change or -command line options is required. - -As explained earlier this allows detection of coupled tests - where one test's state affects the state of another. When -`pytest-random-order` is installed it will print the random seed it used for that session, e.g: - -```bash -pytest tests -[...] -Using --random-order-bucket=module -Using --random-order-seed=573663 -``` - -So that if the given particular sequence fails, you can reproduce it by adding that exact seed, e.g.: - -```bash -pytest --random-order-seed=573663 -[...] -Using --random-order-bucket=module -Using --random-order-seed=573663 -``` - -It will only reproduce the exact order if you use the exact same list of tests (or no list at all). Once you start to -manually narrowing down the list you can no longer rely on the seed, but have to list them manually in the exact order -they failed and tell pytest to not randomize them instead using `--random-order-bucket=none`, e.g.: - -```bash -pytest --random-order-bucket=none tests/test_a.py tests/test_c.py tests/test_b.py -``` - -To disable the shuffling for all tests: - -```bash -pytest --random-order-bucket=none -``` - -By default `--random-order-bucket=module` is implied, which will shuffle the files on the module levels. It can also -shuffle on `class`, `package`, `global` and `none` levels. 
For the complete details please see its -[documentation](https://github.com/jbasko/pytest-random-order). - -Another randomization alternative is: [`pytest-randomly`](https://github.com/pytest-dev/pytest-randomly). This -module has a very similar functionality/interface, but it doesn't have the bucket modes available in -`pytest-random-order`. It has the same problem of imposing itself once installed. - -### Look and feel variations - -#### pytest-sugar - -[pytest-sugar](https://github.com/Frozenball/pytest-sugar) is a plugin that improves the look-n-feel, adds a -progressbar, and show tests that fail and the assert instantly. It gets activated automatically upon installation. - -```bash -pip install pytest-sugar -``` - -To run tests without it, run: - -```bash -pytest -p no:sugar -``` - -or uninstall it. - - - -#### Report each sub-test name and its progress - -For a single or a group of tests via `pytest` (after `pip install pytest-pspec`): - -```bash -pytest --pspec tests/test_optimization.py -``` - -#### Instantly shows failed tests - -[pytest-instafail](https://github.com/pytest-dev/pytest-instafail) shows failures and errors instantly instead of -waiting until the end of test session. - -```bash -pip install pytest-instafail -``` - -```bash -pytest --instafail -``` - -### To GPU or not to GPU - -On a GPU-enabled setup, to test in CPU-only mode add `CUDA_VISIBLE_DEVICES=""`: - -```bash -CUDA_VISIBLE_DEVICES="" pytest tests/test_logging.py -``` - -or if you have multiple gpus, you can specify which one is to be used by `pytest`. For example, to use only the -second gpu if you have gpus `0` and `1`, you can run: - -```bash -CUDA_VISIBLE_DEVICES="1" pytest tests/test_logging.py -``` - -This is handy when you want to run different tasks on different GPUs. - -Some tests must be run on CPU-only, others on either CPU or GPU or TPU, yet others on multiple-GPUs. The following skip -decorators are used to set the requirements of tests CPU/GPU/TPU-wise: - -- `require_torch` - this test will run only under torch -- `require_torch_gpu` - as `require_torch` plus requires at least 1 GPU -- `require_torch_multi_gpu` - as `require_torch` plus requires at least 2 GPUs -- `require_torch_non_multi_gpu` - as `require_torch` plus requires 0 or 1 GPUs -- `require_torch_up_to_2_gpus` - as `require_torch` plus requires 0 or 1 or 2 GPUs -- `require_torch_tpu` - as `require_torch` plus requires at least 1 TPU - -Let's depict the GPU requirements in the following table: - - -| n gpus | decorator | -|--------+--------------------------------| -| `>= 0` | `@require_torch` | -| `>= 1` | `@require_torch_gpu` | -| `>= 2` | `@require_torch_multi_gpu` | -| `< 2` | `@require_torch_non_multi_gpu` | -| `< 3` | `@require_torch_up_to_2_gpus` | - - -For example, here is a test that must be run only when there are 2 or more GPUs available and pytorch is installed: - -```python no-style -@require_torch_multi_gpu -def test_example_with_multi_gpu(): -``` - -If a test requires `tensorflow` use the `require_tf` decorator. For example: - -```python no-style -@require_tf -def test_tf_thing_with_tensorflow(): -``` - -These decorators can be stacked. For example, if a test is slow and requires at least one GPU under pytorch, here is -how to set it up: - -```python no-style -@require_torch_gpu -@slow -def test_example_slow_on_gpu(): -``` - -Some decorators like `@parametrized` rewrite test names, therefore `@require_*` skip decorators have to be listed -last for them to work correctly. 
Here is an example of the correct usage: - -```python no-style -@parameterized.expand(...) -@require_torch_multi_gpu -def test_integration_foo(): -``` - -This order problem doesn't exist with `@pytest.mark.parametrize`, you can put it first or last and it will still -work. But it only works with non-unittests. - -Inside tests: - -- How many GPUs are available: - -```python -from transformers.testing_utils import get_gpu_count - -n_gpu = get_gpu_count() # works with torch and tf -``` - -### Distributed training - -`pytest` can't deal with distributed training directly. If this is attempted - the sub-processes don't do the right -thing and end up thinking they are `pytest` and start running the test suite in loops. It works, however, if one -spawns a normal process that then spawns off multiple workers and manages the IO pipes. - -Here are some tests that use it: - -- [test_trainer_distributed.py](https://github.com/huggingface/transformers/tree/main/tests/trainer/test_trainer_distributed.py) -- [test_deepspeed.py](https://github.com/huggingface/transformers/tree/main/tests/deepspeed/test_deepspeed.py) - -To jump right into the execution point, search for the `execute_subprocess_async` call in those tests. - -You will need at least 2 GPUs to see these tests in action: - -```bash -CUDA_VISIBLE_DEVICES=0,1 RUN_SLOW=1 pytest -sv tests/test_trainer_distributed.py -``` - -### Output capture - -During test execution any output sent to `stdout` and `stderr` is captured. If a test or a setup method fails, its -according captured output will usually be shown along with the failure traceback. - -To disable output capturing and to get the `stdout` and `stderr` normally, use `-s` or `--capture=no`: - -```bash -pytest -s tests/test_logging.py -``` - -To send test results to JUnit format output: - -```bash -py.test tests --junitxml=result.xml -``` - -### Color control - -To have no color (e.g., yellow on white background is not readable): - -```bash -pytest --color=no tests/test_logging.py -``` - -### Sending test report to online pastebin service - -Creating a URL for each test failure: - -```bash -pytest --pastebin=failed tests/test_logging.py -``` - -This will submit test run information to a remote Paste service and provide a URL for each failure. You may select -tests as usual or add for example -x if you only want to send one particular failure. - -Creating a URL for a whole test session log: - -```bash -pytest --pastebin=all tests/test_logging.py -``` - -## Writing tests - -🤗 transformers tests are based on `unittest`, but run by `pytest`, so most of the time features from both systems -can be used. - -You can read [here](https://docs.pytest.org/en/stable/unittest.html) which features are supported, but the important -thing to remember is that most `pytest` fixtures don't work. Neither parametrization, but we use the module -`parameterized` that works in a similar way. - - -### Parametrization - -Often, there is a need to run the same test multiple times, but with different arguments. It could be done from within -the test, but then there is no way of running that test for just one set of arguments. 
- -```python -# test_this1.py -import unittest -from parameterized import parameterized - - -class TestMathUnitTest(unittest.TestCase): - @parameterized.expand( - [ - ("negative", -1.5, -2.0), - ("integer", 1, 1.0), - ("large fraction", 1.6, 1), - ] - ) - def test_floor(self, name, input, expected): - assert_equal(math.floor(input), expected) -``` - -Now, by default this test will be run 3 times, each time with the last 3 arguments of `test_floor` being assigned the -corresponding arguments in the parameter list. - -and you could run just the `negative` and `integer` sets of params with: - -```bash -pytest -k "negative and integer" tests/test_mytest.py -``` - -or all but `negative` sub-tests, with: - -```bash -pytest -k "not negative" tests/test_mytest.py -``` - -Besides using the `-k` filter that was just mentioned, you can find out the exact name of each sub-test and run any -or all of them using their exact names. - -```bash -pytest test_this1.py --collect-only -q -``` - -and it will list: - -```bash -test_this1.py::TestMathUnitTest::test_floor_0_negative -test_this1.py::TestMathUnitTest::test_floor_1_integer -test_this1.py::TestMathUnitTest::test_floor_2_large_fraction -``` - -So now you can run just 2 specific sub-tests: - -```bash -pytest test_this1.py::TestMathUnitTest::test_floor_0_negative test_this1.py::TestMathUnitTest::test_floor_1_integer -``` - -The module [parameterized](https://pypi.org/project/parameterized/) which is already in the developer dependencies -of `transformers` works for both: `unittests` and `pytest` tests. - -If, however, the test is not a `unittest`, you may use `pytest.mark.parametrize` (or you may see it being used in -some existing tests, mostly under `examples`). - -Here is the same example, this time using `pytest`'s `parametrize` marker: - -```python -# test_this2.py -import pytest - - -@pytest.mark.parametrize( - "name, input, expected", - [ - ("negative", -1.5, -2.0), - ("integer", 1, 1.0), - ("large fraction", 1.6, 1), - ], -) -def test_floor(name, input, expected): - assert_equal(math.floor(input), expected) -``` - -Same as with `parameterized`, with `pytest.mark.parametrize` you can have a fine control over which sub-tests are -run, if the `-k` filter doesn't do the job. Except, this parametrization function creates a slightly different set of -names for the sub-tests. Here is what they look like: - -```bash -pytest test_this2.py --collect-only -q -``` - -and it will list: - -```bash -test_this2.py::test_floor[integer-1-1.0] -test_this2.py::test_floor[negative--1.5--2.0] -test_this2.py::test_floor[large fraction-1.6-1] -``` - -So now you can run just the specific test: - -```bash -pytest test_this2.py::test_floor[negative--1.5--2.0] test_this2.py::test_floor[integer-1-1.0] -``` - -as in the previous example. - - - -### Files and directories - -In tests often we need to know where things are relative to the current test file, and it's not trivial since the test -could be invoked from more than one directory or could reside in sub-directories with different depths. A helper class -`transformers.test_utils.TestCasePlus` solves this problem by sorting out all the basic paths and provides easy -accessors to them: - -- `pathlib` objects (all fully resolved): - - - `test_file_path` - the current test file path, i.e. 
`__file__` - - `test_file_dir` - the directory containing the current test file - - `tests_dir` - the directory of the `tests` test suite - - `examples_dir` - the directory of the `examples` test suite - - `repo_root_dir` - the directory of the repository - - `src_dir` - the directory of `src` (i.e. where the `transformers` sub-dir resides) - -- stringified paths---same as above but these return paths as strings, rather than `pathlib` objects: - - - `test_file_path_str` - - `test_file_dir_str` - - `tests_dir_str` - - `examples_dir_str` - - `repo_root_dir_str` - - `src_dir_str` - -To start using those all you need is to make sure that the test resides in a subclass of -`transformers.test_utils.TestCasePlus`. For example: - -```python -from transformers.testing_utils import TestCasePlus - - -class PathExampleTest(TestCasePlus): - def test_something_involving_local_locations(self): - data_dir = self.tests_dir / "fixtures/tests_samples/wmt_en_ro" -``` - -If you don't need to manipulate paths via `pathlib` or you just need a path as a string, you can always invoked -`str()` on the `pathlib` object or use the accessors ending with `_str`. For example: - -```python -from transformers.testing_utils import TestCasePlus - - -class PathExampleTest(TestCasePlus): - def test_something_involving_stringified_locations(self): - examples_dir = self.examples_dir_str -``` - -### Temporary files and directories - -Using unique temporary files and directories are essential for parallel test running, so that the tests won't overwrite -each other's data. Also we want to get the temporary files and directories removed at the end of each test that created -them. Therefore, using packages like `tempfile`, which address these needs is essential. - -However, when debugging tests, you need to be able to see what goes into the temporary file or directory and you want -to know it's exact path and not having it randomized on every test re-run. - -A helper class `transformers.test_utils.TestCasePlus` is best used for such purposes. It's a sub-class of -`unittest.TestCase`, so we can easily inherit from it in the test modules. - -Here is an example of its usage: - -```python -from transformers.testing_utils import TestCasePlus - - -class ExamplesTests(TestCasePlus): - def test_whatever(self): - tmp_dir = self.get_auto_remove_tmp_dir() -``` - -This code creates a unique temporary directory, and sets `tmp_dir` to its location. - -- Create a unique temporary dir: - -```python -def test_whatever(self): - tmp_dir = self.get_auto_remove_tmp_dir() -``` - -`tmp_dir` will contain the path to the created temporary dir. It will be automatically removed at the end of the -test. - -- Create a temporary dir of my choice, ensure it's empty before the test starts and don't empty it after the test. - -```python -def test_whatever(self): - tmp_dir = self.get_auto_remove_tmp_dir("./xxx") -``` - -This is useful for debug when you want to monitor a specific directory and want to make sure the previous tests didn't -leave any data in there. - -- You can override the default behavior by directly overriding the `before` and `after` args, leading to one of the - following behaviors: - - - `before=True`: the temporary dir will always be cleared at the beginning of the test. - - `before=False`: if the temporary dir already existed, any existing files will remain there. - - `after=True`: the temporary dir will always be deleted at the end of the test. - - `after=False`: the temporary dir will always be left intact at the end of the test. 
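Combining these arguments, here is a minimal sketch (reusing the `TestCasePlus` helper shown above; `./xxx` is just an example path) that keeps a fixed directory around for inspection while making sure it starts out empty:

```python
from transformers.testing_utils import TestCasePlus


class ExamplesTests(TestCasePlus):
    def test_whatever(self):
        # fixed path, emptied before the test starts, left intact afterwards for debugging
        tmp_dir = self.get_auto_remove_tmp_dir("./xxx", before=True, after=False)
```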
- - - -In order to run the equivalent of `rm -r` safely, only subdirs of the project repository checkout are allowed if -an explicit `tmp_dir` is used, so that by mistake no `/tmp` or similar important part of the filesystem will -get nuked. i.e. please always pass paths that start with `./`. - - - - - -Each test can register multiple temporary directories and they all will get auto-removed, unless requested -otherwise. - - - -### Temporary sys.path override - -If you need to temporary override `sys.path` to import from another test for example, you can use the -`ExtendSysPath` context manager. Example: - - -```python -import os -from transformers.testing_utils import ExtendSysPath - -bindir = os.path.abspath(os.path.dirname(__file__)) -with ExtendSysPath(f"{bindir}/.."): - from test_trainer import TrainerIntegrationCommon # noqa -``` - -### Skipping tests - -This is useful when a bug is found and a new test is written, yet the bug is not fixed yet. In order to be able to -commit it to the main repository we need make sure it's skipped during `make test`. - -Methods: - -- A **skip** means that you expect your test to pass only if some conditions are met, otherwise pytest should skip - running the test altogether. Common examples are skipping windows-only tests on non-windows platforms, or skipping - tests that depend on an external resource which is not available at the moment (for example a database). - -- A **xfail** means that you expect a test to fail for some reason. A common example is a test for a feature not yet - implemented, or a bug not yet fixed. When a test passes despite being expected to fail (marked with - pytest.mark.xfail), it’s an xpass and will be reported in the test summary. - -One of the important differences between the two is that `skip` doesn't run the test, and `xfail` does. So if the -code that's buggy causes some bad state that will affect other tests, do not use `xfail`. - -#### Implementation - -- Here is how to skip whole test unconditionally: - -```python no-style -@unittest.skip("this bug needs to be fixed") -def test_feature_x(): -``` - -or via pytest: - -```python no-style -@pytest.mark.skip(reason="this bug needs to be fixed") -``` - -or the `xfail` way: - -```python no-style -@pytest.mark.xfail -def test_feature_x(): -``` - -- Here is how to skip a test based on some internal check inside the test: - -```python -def test_feature_x(): - if not has_something(): - pytest.skip("unsupported configuration") -``` - -or the whole module: - -```python -import pytest - -if not pytest.config.getoption("--custom-flag"): - pytest.skip("--custom-flag is missing, skipping tests", allow_module_level=True) -``` - -or the `xfail` way: - -```python -def test_feature_x(): - pytest.xfail("expected to fail until bug XYZ is fixed") -``` - -- Here is how to skip all tests in a module if some import is missing: - -```python -docutils = pytest.importorskip("docutils", minversion="0.3") -``` - -- Skip a test based on a condition: - -```python no-style -@pytest.mark.skipif(sys.version_info < (3,6), reason="requires python3.6 or higher") -def test_feature_x(): -``` - -or: - -```python no-style -@unittest.skipIf(torch_device == "cpu", "Can't do half precision") -def test_feature_x(): -``` - -or skip the whole module: - -```python no-style -@pytest.mark.skipif(sys.platform == 'win32', reason="does not run on windows") -class TestClass(): - def test_feature_x(self): -``` - -More details, example and ways are [here](https://docs.pytest.org/en/latest/skipping.html). 
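One more pattern worth knowing: `xfail` can also take a condition, in the same spirit as `skipif`. A small sketch (the condition and reason below are placeholders):

```python
import sys

import pytest


@pytest.mark.xfail(sys.platform == "win32", reason="bug in path handling, not fixed yet")
def test_feature_x():
    ...
```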
- -### Slow tests - -The library of tests is ever-growing, and some of the tests take minutes to run, therefore we can't afford waiting for -an hour for the test suite to complete on CI. Therefore, with some exceptions for essential tests, slow tests should be -marked as in the example below: - -```python no-style -from transformers.testing_utils import slow -@slow -def test_integration_foo(): -``` - -Once a test is marked as `@slow`, to run such tests set `RUN_SLOW=1` env var, e.g.: - -```bash -RUN_SLOW=1 pytest tests -``` - -Some decorators like `@parameterized` rewrite test names, therefore `@slow` and the rest of the skip decorators -`@require_*` have to be listed last for them to work correctly. Here is an example of the correct usage: - -```python no-style -@parameteriz ed.expand(...) -@slow -def test_integration_foo(): -``` - -As explained at the beginning of this document, slow tests get to run on a scheduled basis, rather than in PRs CI -checks. So it's possible that some problems will be missed during a PR submission and get merged. Such problems will -get caught during the next scheduled CI job. But it also means that it's important to run the slow tests on your -machine before submitting the PR. - -Here is a rough decision making mechanism for choosing which tests should be marked as slow: - -If the test is focused on one of the library's internal components (e.g., modeling files, tokenization files, -pipelines), then we should run that test in the non-slow test suite. If it's focused on an other aspect of the library, -such as the documentation or the examples, then we should run these tests in the slow test suite. And then, to refine -this approach we should have exceptions: - -- All tests that need to download a heavy set of weights or a dataset that is larger than ~50MB (e.g., model or - tokenizer integration tests, pipeline integration tests) should be set to slow. If you're adding a new model, you - should create and upload to the hub a tiny version of it (with random weights) for integration tests. This is - discussed in the following paragraphs. -- All tests that need to do a training not specifically optimized to be fast should be set to slow. -- We can introduce exceptions if some of these should-be-non-slow tests are excruciatingly slow, and set them to - `@slow`. Auto-modeling tests, which save and load large files to disk, are a good example of tests that are marked - as `@slow`. -- If a test completes under 1 second on CI (including downloads if any) then it should be a normal test regardless. - -Collectively, all the non-slow tests need to cover entirely the different internals, while remaining fast. For example, -a significant coverage can be achieved by testing with specially created tiny models with random weights. Such models -have the very minimal number of layers (e.g., 2), vocab size (e.g., 1000), etc. Then the `@slow` tests can use large -slow models to do qualitative testing. To see the use of these simply look for *tiny* models with: - -```bash -grep tiny tests examples -``` - -Here is a an example of a [script](https://github.com/huggingface/transformers/tree/main/scripts/fsmt/fsmt-make-tiny-model.py) that created the tiny model -[stas/tiny-wmt19-en-de](https://huggingface.co/stas/tiny-wmt19-en-de). You can easily adjust it to your specific -model's architecture. 
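To make the split concrete, here is a hedged sketch of how a test class might pair a fast tiny-model test with a `@slow` full-checkpoint test. The class and test names are made up; `stas/tiny-wmt19-en-de` is the tiny model mentioned above and `facebook/wmt19-en-de` stands in for the corresponding full-size checkpoint:

```python
import unittest

from transformers import AutoModelForSeq2SeqLM
from transformers.testing_utils import require_torch, slow


@require_torch
class TranslationModelTest(unittest.TestCase):
    def test_forward_tiny(self):
        # fast: tiny checkpoint with random weights, runs in the regular CI
        model = AutoModelForSeq2SeqLM.from_pretrained("stas/tiny-wmt19-en-de")
        self.assertIsNotNone(model.config)

    @slow
    def test_generation_quality(self):
        # slow: full-size checkpoint, runs only on the scheduled CI
        model = AutoModelForSeq2SeqLM.from_pretrained("facebook/wmt19-en-de")
        self.assertIsNotNone(model.config)
```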
- -It's easy to measure the run-time incorrectly if for example there is an overheard of downloading a huge model, but if -you test it locally the downloaded files would be cached and thus the download time not measured. Hence check the -execution speed report in CI logs instead (the output of `pytest --durations=0 tests`). - -That report is also useful to find slow outliers that aren't marked as such, or which need to be re-written to be fast. -If you notice that the test suite starts getting slow on CI, the top listing of this report will show the slowest -tests. - - -### Testing the stdout/stderr output - -In order to test functions that write to `stdout` and/or `stderr`, the test can access those streams using the -`pytest`'s [capsys system](https://docs.pytest.org/en/latest/capture.html). Here is how this is accomplished: - -```python -import sys - - -def print_to_stdout(s): - print(s) - - -def print_to_stderr(s): - sys.stderr.write(s) - - -def test_result_and_stdout(capsys): - msg = "Hello" - print_to_stdout(msg) - print_to_stderr(msg) - out, err = capsys.readouterr() # consume the captured output streams - # optional: if you want to replay the consumed streams: - sys.stdout.write(out) - sys.stderr.write(err) - # test: - assert msg in out - assert msg in err -``` - -And, of course, most of the time, `stderr` will come as a part of an exception, so try/except has to be used in such -a case: - -```python -def raise_exception(msg): - raise ValueError(msg) - - -def test_something_exception(): - msg = "Not a good value" - error = "" - try: - raise_exception(msg) - except Exception as e: - error = str(e) - assert msg in error, f"{msg} is in the exception:\n{error}" -``` - -Another approach to capturing stdout is via `contextlib.redirect_stdout`: - -```python -from io import StringIO -from contextlib import redirect_stdout - - -def print_to_stdout(s): - print(s) - - -def test_result_and_stdout(): - msg = "Hello" - buffer = StringIO() - with redirect_stdout(buffer): - print_to_stdout(msg) - out = buffer.getvalue() - # optional: if you want to replay the consumed streams: - sys.stdout.write(out) - # test: - assert msg in out -``` - -An important potential issue with capturing stdout is that it may contain `\r` characters that in normal `print` -reset everything that has been printed so far. There is no problem with `pytest`, but with `pytest -s` these -characters get included in the buffer, so to be able to have the test run with and without `-s`, you have to make an -extra cleanup to the captured output, using `re.sub(r'~.*\r', '', buf, 0, re.M)`. 
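For instance, a minimal helper applying that cleanup (the function name here is made up; the regular expression is the one quoted above):

```python
import re


def clean_captured_output(buf):
    # apply the extra cleanup suggested above so the assertion passes both with and without `-s`
    return re.sub(r"~.*\r", "", buf, 0, re.M)
```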
- -But, then we have a helper context manager wrapper to automatically take care of it all, regardless of whether it has -some `\r`'s in it or not, so it's a simple: - -```python -from transformers.testing_utils import CaptureStdout - -with CaptureStdout() as cs: - function_that_writes_to_stdout() -print(cs.out) -``` - -Here is a full test example: - -```python -from transformers.testing_utils import CaptureStdout - -msg = "Secret message\r" -final = "Hello World" -with CaptureStdout() as cs: - print(msg + final) -assert cs.out == final + "\n", f"captured: {cs.out}, expecting {final}" -``` - -If you'd like to capture `stderr` use the `CaptureStderr` class instead: - -```python -from transformers.testing_utils import CaptureStderr - -with CaptureStderr() as cs: - function_that_writes_to_stderr() -print(cs.err) -``` - -If you need to capture both streams at once, use the parent `CaptureStd` class: - -```python -from transformers.testing_utils import CaptureStd - -with CaptureStd() as cs: - function_that_writes_to_stdout_and_stderr() -print(cs.err, cs.out) -``` - -Also, to aid debugging test issues, by default these context managers automatically replay the captured streams on exit -from the context. - - -### Capturing logger stream - -If you need to validate the output of a logger, you can use `CaptureLogger`: - -```python -from transformers import logging -from transformers.testing_utils import CaptureLogger - -msg = "Testing 1, 2, 3" -logging.set_verbosity_info() -logger = logging.get_logger("transformers.models.bart.tokenization_bart") -with CaptureLogger(logger) as cl: - logger.info(msg) -assert cl.out, msg + "\n" -``` - -### Testing with environment variables - -If you want to test the impact of environment variables for a specific test you can use a helper decorator -`transformers.testing_utils.mockenv` - -```python -from transformers.testing_utils import mockenv - - -class HfArgumentParserTest(unittest.TestCase): - @mockenv(TRANSFORMERS_VERBOSITY="error") - def test_env_override(self): - env_level_str = os.getenv("TRANSFORMERS_VERBOSITY", None) -``` - -At times an external program needs to be called, which requires setting `PYTHONPATH` in `os.environ` to include -multiple local paths. A helper class `transformers.test_utils.TestCasePlus` comes to help: - -```python -from transformers.testing_utils import TestCasePlus - - -class EnvExampleTest(TestCasePlus): - def test_external_prog(self): - env = self.get_env() - # now call the external program, passing `env` to it -``` - -Depending on whether the test file was under the `tests` test suite or `examples` it'll correctly set up -`env[PYTHONPATH]` to include one of these two directories, and also the `src` directory to ensure the testing is -done against the current repo, and finally with whatever `env[PYTHONPATH]` was already set to before the test was -called if anything. - -This helper method creates a copy of the `os.environ` object, so the original remains intact. - - -### Getting reproducible results - -In some situations you may want to remove randomness for your tests. 
To get identical reproducible results set, you -will need to fix the seed: - -```python -seed = 42 - -# python RNG -import random - -random.seed(seed) - -# pytorch RNGs -import torch - -torch.manual_seed(seed) -torch.backends.cudnn.deterministic = True -if torch.cuda.is_available(): - torch.cuda.manual_seed_all(seed) - -# numpy RNG -import numpy as np - -np.random.seed(seed) - -# tf RNG -tf.random.set_seed(seed) -``` - -### Debugging tests - -To start a debugger at the point of the warning, do this: - -```bash -pytest tests/test_logging.py -W error::UserWarning --pdb -``` - -## Working with github actions workflows - -To trigger a self-push workflow CI job, you must: - -1. Create a new branch on `transformers` origin (not a fork!). -2. The branch name has to start with either `ci_` or `ci-` (`main` triggers it too, but we can't do PRs on - `main`). It also gets triggered only for specific paths - you can find the up-to-date definition in case it - changed since this document has been written [here](https://github.com/huggingface/transformers/blob/main/.github/workflows/self-push.yml) under *push:* -3. Create a PR from this branch. -4. Then you can see the job appear [here](https://github.com/huggingface/transformers/actions/workflows/self-push.yml). It may not run right away if there - is a backlog. - - - - -## Testing Experimental CI Features - -Testing CI features can be potentially problematic as it can interfere with the normal CI functioning. Therefore if a -new CI feature is to be added, it should be done as following. - -1. Create a new dedicated job that tests what needs to be tested -2. The new job must always succeed so that it gives us a green ✓ (details below). -3. Let it run for some days to see that a variety of different PR types get to run on it (user fork branches, - non-forked branches, branches originating from github.com UI direct file edit, various forced pushes, etc. - there - are so many) while monitoring the experimental job's logs (not the overall job green as it's purposefully always - green) -4. When it's clear that everything is solid, then merge the new changes into existing jobs. - -That way experiments on CI functionality itself won't interfere with the normal workflow. - -Now how can we make the job always succeed while the new CI feature is being developed? - -Some CIs, like TravisCI support ignore-step-failure and will report the overall job as successful, but CircleCI and -Github Actions as of this writing don't support that. - -So the following workaround can be used: - -1. `set +euo pipefail` at the beginning of the run command to suppress most potential failures in the bash script. -2. the last command must be a success: `echo "done"` or just `true` will do - -Here is an example: - -```yaml -- run: - name: run CI experiment - command: | - set +euo pipefail - echo "setting run-all-despite-any-errors-mode" - this_command_will_fail - echo "but bash continues to run" - # emulate another failure - false - # but the last command must be a success - echo "during experiment do not remove: reporting success to CI, even if there were failures" -``` - -For simple commands you could also do: - -```bash -cmd_that_may_fail || true -``` - -Of course, once satisfied with the results, integrate the experimental step or job with the rest of the normal jobs, -while removing `set +euo pipefail` or any other things you may have added to ensure that the experimental job doesn't -interfere with the normal CI functioning. 
- -This whole process would have been much easier if we only could set something like `allow-failure` for the -experimental step, and let it fail without impacting the overall status of PRs. But as mentioned earlier CircleCI and -Github Actions don't support it at the moment. - -You can vote for this feature and see where it is at these CI-specific threads: - -- [Github Actions:](https://github.com/actions/toolkit/issues/399) -- [CircleCI:](https://ideas.circleci.com/ideas/CCI-I-344) diff --git a/docs/source/en/tf_xla.md b/docs/source/en/tf_xla.md new file mode 100644 index 000000000000..5f6a360dd8d5 --- /dev/null +++ b/docs/source/en/tf_xla.md @@ -0,0 +1,174 @@ + + +# XLA Integration for TensorFlow Models + +[[open-in-colab]] + +Accelerated Linear Algebra, dubbed XLA, is a compiler for accelerating the runtime of TensorFlow Models. From the [official documentation](https://www.tensorflow.org/xla): + +XLA (Accelerated Linear Algebra) is a domain-specific compiler for linear algebra that can accelerate TensorFlow models with potentially no source code changes. + +Using XLA in TensorFlow is simple – it comes packaged inside the `tensorflow` library, and it can be triggered with the `jit_compile` argument in any graph-creating function such as [`tf.function`](https://www.tensorflow.org/guide/intro_to_graphs). When using Keras methods like `fit()` and `predict()`, you can enable XLA simply by passing the `jit_compile` argument to `model.compile()`. However, XLA is not limited to these methods - it can also be used to accelerate any arbitrary `tf.function`. + +Several TensorFlow methods in 🤗 Transformers have been rewritten to be XLA-compatible, including text generation for models such as [GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2), [T5](https://huggingface.co/docs/transformers/model_doc/t5) and [OPT](https://huggingface.co/docs/transformers/model_doc/opt), as well as speech processing for models such as [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper). + +While the exact amount of speed-up is very much model-dependent, for TensorFlow text generation models inside 🤗 Transformers, we noticed a speed-up of ~100x. This document will explain how you can use XLA for these models to get the maximum amount of performance. We’ll also provide links to additional resources if you’re interested to learn more about the benchmarks and our design philosophy behind the XLA integration. + +## Running TF functions with XLA + +Let us consider the following model in TensorFlow: + +```py +import tensorflow as tf + +model = tf.keras.Sequential( + [tf.keras.layers.Dense(10, input_shape=(10,), activation="relu"), tf.keras.layers.Dense(5, activation="softmax")] +) +``` + +The above model accepts inputs having a dimension of `(10, )`. We can use the model for running a forward pass like so: + +```py +# Generate random inputs for the model. +batch_size = 16 +input_vector_dim = 10 +random_inputs = tf.random.normal((batch_size, input_vector_dim)) + +# Run a forward pass. +_ = model(random_inputs) +``` + +In order to run the forward pass with an XLA-compiled function, we’d need to do: + +```py +xla_fn = tf.function(model, jit_compile=True) +_ = xla_fn(random_inputs) +``` + +The default `call()` function of the `model` is used for compiling the XLA graph. 
But if there’s any other model function you want to compile into XLA that’s also possible with: + +```py +my_xla_fn = tf.function(model.my_xla_fn, jit_compile=True) +``` + +## Running a TF text generation model with XLA from 🤗 Transformers + +To enable XLA-accelerated generation within 🤗 Transformers, you need to have a recent version of `transformers` installed. You can install it by running: + +```bash +pip install transformers --upgrade +``` + +And then you can run the following code: + +```py +import tensorflow as tf +from transformers import AutoTokenizer, TFAutoModelForCausalLM + +# Will error if the minimal version of Transformers is not installed. +from transformers.utils import check_min_version + +check_min_version("4.21.0") + + +tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left", pad_token="") +model = TFAutoModelForCausalLM.from_pretrained("gpt2") +input_string = ["TensorFlow is"] + +# One line to create an XLA generation function +xla_generate = tf.function(model.generate, jit_compile=True) + +tokenized_input = tokenizer(input_string, return_tensors="tf") +generated_tokens = xla_generate(**tokenized_input, num_beams=2) + +decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True) +print(f"Generated -- {decoded_text}") +# Generated -- TensorFlow is an open-source, open-source, distributed-source application # framework for the +``` + +As you can notice, enabling XLA on `generate()` is just a single line of code. The rest of the code remains unchanged. However, there are a couple of gotchas in the above code snippet that are specific to XLA. You need to be aware of those to realize the speed-ups that XLA can bring in. We discuss these in the following section. + +## Gotchas to be aware of + +When you are executing an XLA-enabled function (like `xla_generate()` above) for the first time, it will internally try to infer the computation graph, which is time-consuming. This process is known as [“tracing”](https://www.tensorflow.org/guide/intro_to_graphs#when_is_a_function_tracing). + +You might notice that the generation time is not fast. Successive calls of `xla_generate()` (or any other XLA-enabled function) won’t have to infer the computation graph, given the inputs to the function follow the same shape with which the computation graph was initially built. While this is not a problem for modalities with fixed input shapes (e.g., images), you must pay attention if you are working with variable input shape modalities (e.g., text). + +To ensure `xla_generate()` always operates with the same input shapes, you can specify the `padding` arguments when calling the tokenizer. + +```py +import tensorflow as tf +from transformers import AutoTokenizer, TFAutoModelForCausalLM + +tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left", pad_token="") +model = TFAutoModelForCausalLM.from_pretrained("gpt2") +input_string = ["TensorFlow is"] + +xla_generate = tf.function(model.generate, jit_compile=True) + +# Here, we call the tokenizer with padding options. +tokenized_input = tokenizer(input_string, pad_to_multiple_of=8, padding=True, return_tensors="tf") + +generated_tokens = xla_generate(**tokenized_input, num_beams=2) +decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True) +print(f"Generated -- {decoded_text}") +``` + +This way, you can ensure that the inputs to `xla_generate()` will always receive inputs with the shape it was traced with and thus leading to speed-ups in the generation time. 
You can verify this with the code below: + +```py +import time +import tensorflow as tf +from transformers import AutoTokenizer, TFAutoModelForCausalLM + +tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left", pad_token="") +model = TFAutoModelForCausalLM.from_pretrained("gpt2") + +xla_generate = tf.function(model.generate, jit_compile=True) + +for input_string in ["TensorFlow is", "TensorFlow is a", "TFLite is a"]: + tokenized_input = tokenizer(input_string, pad_to_multiple_of=8, padding=True, return_tensors="tf") + start = time.time_ns() + generated_tokens = xla_generate(**tokenized_input, num_beams=2) + end = time.time_ns() + print(f"Execution time -- {(end - start) / 1e6:.1f} ms\n") +``` + +On a Tesla T4 GPU, you can expect the outputs like so: + +```bash +Execution time -- 30819.6 ms + +Execution time -- 79.0 ms + +Execution time -- 78.9 ms +``` +The first call to `xla_generate()` is time-consuming because of tracing, but the successive calls are orders of magnitude faster. Keep in mind that any change in the generation options at any point with trigger re-tracing and thus leading to slow-downs in the generation time. + +We didn’t cover all the text generation options 🤗 Transformers provides in this document. We encourage you to read the documentation for advanced use cases. + +## Additional Resources + +Here, we leave you with some additional resources if you want to delve deeper into XLA in 🤗 Transformers and in general. + +* [This Colab Notebook](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/91_tf_xla_generate.ipynb) provides an interactive demonstration if you want to fiddle with the XLA-compatible encoder-decoder (like [T5](https://huggingface.co/docs/transformers/model_doc/t5)) and decoder-only (like [GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2)) text generation models. +* [This blog post](https://huggingface.co/blog/tf-xla-generate) provides an overview of the comparison benchmarks for XLA-compatible models along with a friendly introduction to XLA in TensorFlow. +* [This blog post](https://blog.tensorflow.org/2022/11/how-hugging-face-improved-text-generation-performance-with-xla.html) discusses our design philosophy behind adding XLA support to the TensorFlow models in 🤗 Transformers. +* Recommended posts for learning more about XLA and TensorFlow graphs in general: + * [XLA: Optimizing Compiler for Machine Learning](https://www.tensorflow.org/xla) + * [Introduction to graphs and tf.function](https://www.tensorflow.org/guide/intro_to_graphs) + * [Better performance with tf.function](https://www.tensorflow.org/guide/function) \ No newline at end of file diff --git a/docs/source/en/tflite.md b/docs/source/en/tflite.md new file mode 100644 index 000000000000..7b7735c992ea --- /dev/null +++ b/docs/source/en/tflite.md @@ -0,0 +1,62 @@ + + +# Export to TFLite + +[TensorFlow Lite](https://www.tensorflow.org/lite/guide) is a lightweight framework for deploying machine learning models +on resource-constrained devices, such as mobile phones, embedded systems, and Internet of Things (IoT) devices. +TFLite is designed to optimize and run models efficiently on these devices with limited computational power, memory, and +power consumption. +A TensorFlow Lite model is represented in a special efficient portable format identified by the `.tflite` file extension. + +🤗 Optimum offers functionality to export 🤗 Transformers models to TFLite through the `exporters.tflite` module. 
+For the list of supported model architectures, please refer to [🤗 Optimum documentation](https://huggingface.co/docs/optimum/exporters/tflite/overview). + +To export a model to TFLite, install the required dependencies: + +```bash +pip install optimum[exporters-tf] +``` + +To check out all available arguments, refer to the [🤗 Optimum docs](https://huggingface.co/docs/optimum/main/en/exporters/tflite/usage_guides/export_a_model), +or view help in command line: + +```bash +optimum-cli export tflite --help +``` + +To export a model's checkpoint from the 🤗 Hub, for example, `bert-base-uncased`, run the following command: + +```bash +optimum-cli export tflite --model bert-base-uncased --sequence_length 128 bert_tflite/ +``` + +You should see the logs indicating progress and showing where the resulting `model.tflite` is saved, like this: + +```bash +Validating TFLite model... + -[✓] TFLite model output names match reference model (logits) + - Validating TFLite Model output "logits": + -[✓] (1, 128, 30522) matches (1, 128, 30522) + -[x] values not close enough, max diff: 5.817413330078125e-05 (atol: 1e-05) +The TensorFlow Lite export succeeded with the warning: The maximum absolute difference between the output of the reference model and the TFLite exported model is not within the set tolerance 1e-05: +- logits: max diff = 5.817413330078125e-05. + The exported model was saved at: bert_tflite + ``` + +The example above illustrates exporting a checkpoint from 🤗 Hub. When exporting a local model, first make sure that you +saved both the model's weights and tokenizer files in the same directory (`local_path`). When using CLI, pass the +`local_path` to the `model` argument instead of the checkpoint name on 🤗 Hub. \ No newline at end of file diff --git a/docs/source/en/tokenizer_summary.md b/docs/source/en/tokenizer_summary.md new file mode 100644 index 000000000000..5a23c7bf8473 --- /dev/null +++ b/docs/source/en/tokenizer_summary.md @@ -0,0 +1,282 @@ + + +# Summary of the tokenizers + +[[open-in-colab]] + +On this page, we will have a closer look at tokenization. + + + +As we saw in [the preprocessing tutorial](preprocessing), tokenizing a text is splitting it into words or +subwords, which then are converted to ids through a look-up table. Converting words or subwords to ids is +straightforward, so in this summary, we will focus on splitting a text into words or subwords (i.e. tokenizing a text). +More specifically, we will look at the three main types of tokenizers used in 🤗 Transformers: [Byte-Pair Encoding +(BPE)](#byte-pair-encoding), [WordPiece](#wordpiece), and [SentencePiece](#sentencepiece), and show examples +of which tokenizer type is used by which model. + +Note that on each model page, you can look at the documentation of the associated tokenizer to know which tokenizer +type was used by the pretrained model. For instance, if we look at [`BertTokenizer`], we can see +that the model uses [WordPiece](#wordpiece). + +## Introduction + +Splitting a text into smaller chunks is a task that is harder than it looks, and there are multiple ways of doing so. +For instance, let's look at the sentence `"Don't you love 🤗 Transformers? 
We sure do."` + + + +A simple way of tokenizing this text is to split it by spaces, which would give: + +``` +["Don't", "you", "love", "🤗", "Transformers?", "We", "sure", "do."] +``` + +This is a sensible first step, but if we look at the tokens `"Transformers?"` and `"do."`, we notice that the +punctuation is attached to the words `"Transformer"` and `"do"`, which is suboptimal. We should take the +punctuation into account so that a model does not have to learn a different representation of a word and every possible +punctuation symbol that could follow it, which would explode the number of representations the model has to learn. +Taking punctuation into account, tokenizing our exemplary text would give: + +``` +["Don", "'", "t", "you", "love", "🤗", "Transformers", "?", "We", "sure", "do", "."] +``` + +Better. However, it is disadvantageous, how the tokenization dealt with the word `"Don't"`. `"Don't"` stands for +`"do not"`, so it would be better tokenized as `["Do", "n't"]`. This is where things start getting complicated, and +part of the reason each model has its own tokenizer type. Depending on the rules we apply for tokenizing a text, a +different tokenized output is generated for the same text. A pretrained model only performs properly if you feed it an +input that was tokenized with the same rules that were used to tokenize its training data. + +[spaCy](https://spacy.io/) and [Moses](http://www.statmt.org/moses/?n=Development.GetStarted) are two popular +rule-based tokenizers. Applying them on our example, *spaCy* and *Moses* would output something like: + +``` +["Do", "n't", "you", "love", "🤗", "Transformers", "?", "We", "sure", "do", "."] +``` + +As can be seen space and punctuation tokenization, as well as rule-based tokenization, is used here. Space and +punctuation tokenization and rule-based tokenization are both examples of word tokenization, which is loosely defined +as splitting sentences into words. While it's the most intuitive way to split texts into smaller chunks, this +tokenization method can lead to problems for massive text corpora. In this case, space and punctuation tokenization +usually generates a very big vocabulary (the set of all unique words and tokens used). *E.g.*, [Transformer XL](model_doc/transformerxl) uses space and punctuation tokenization, resulting in a vocabulary size of 267,735! + +Such a big vocabulary size forces the model to have an enormous embedding matrix as the input and output layer, which +causes both an increased memory and time complexity. In general, transformers models rarely have a vocabulary size +greater than 50,000, especially if they are pretrained only on a single language. + +So if simple space and punctuation tokenization is unsatisfactory, why not simply tokenize on characters? + + + +While character tokenization is very simple and would greatly reduce memory and time complexity it makes it much harder +for the model to learn meaningful input representations. *E.g.* learning a meaningful context-independent +representation for the letter `"t"` is much harder than learning a context-independent representation for the word +`"today"`. Therefore, character tokenization is often accompanied by a loss of performance. So to get the best of +both worlds, transformers models use a hybrid between word-level and character-level tokenization called **subword** +tokenization. 
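Before moving on, here is a tiny plain-Python illustration of the two extremes discussed above (it does not use any 🤗 Transformers API):

```py
>>> text = "Don't you love 🤗 Transformers? We sure do."
>>> text.split()  # word level: few tokens per sentence, but a huge vocabulary
["Don't", 'you', 'love', '🤗', 'Transformers?', 'We', 'sure', 'do.']
>>> list("today")  # character level: a tiny vocabulary, but much longer sequences
['t', 'o', 'd', 'a', 'y']
```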
+ +## Subword tokenization + + + +Subword tokenization algorithms rely on the principle that frequently used words should not be split into smaller +subwords, but rare words should be decomposed into meaningful subwords. For instance `"annoyingly"` might be +considered a rare word and could be decomposed into `"annoying"` and `"ly"`. Both `"annoying"` and `"ly"` as +stand-alone subwords would appear more frequently while at the same time the meaning of `"annoyingly"` is kept by the +composite meaning of `"annoying"` and `"ly"`. This is especially useful in agglutinative languages such as Turkish, +where you can form (almost) arbitrarily long complex words by stringing together subwords. + +Subword tokenization allows the model to have a reasonable vocabulary size while being able to learn meaningful +context-independent representations. In addition, subword tokenization enables the model to process words it has never +seen before, by decomposing them into known subwords. For instance, the [`~transformers.BertTokenizer`] tokenizes +`"I have a new GPU!"` as follows: + +```py +>>> from transformers import BertTokenizer + +>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") +>>> tokenizer.tokenize("I have a new GPU!") +["i", "have", "a", "new", "gp", "##u", "!"] +``` + +Because we are considering the uncased model, the sentence was lowercased first. We can see that the words `["i", "have", "a", "new"]` are present in the tokenizer's vocabulary, but the word `"gpu"` is not. Consequently, the +tokenizer splits `"gpu"` into known subwords: `["gp" and "##u"]`. `"##"` means that the rest of the token should +be attached to the previous one, without space (for decoding or reversal of the tokenization). + +As another example, [`~transformers.XLNetTokenizer`] tokenizes our previously exemplary text as follows: + +```py +>>> from transformers import XLNetTokenizer + +>>> tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased") +>>> tokenizer.tokenize("Don't you love 🤗 Transformers? We sure do.") +["▁Don", "'", "t", "▁you", "▁love", "▁", "🤗", "▁", "Transform", "ers", "?", "▁We", "▁sure", "▁do", "."] +``` + +We'll get back to the meaning of those `"▁"` when we look at [SentencePiece](#sentencepiece). As one can see, +the rare word `"Transformers"` has been split into the more frequent subwords `"Transform"` and `"ers"`. + +Let's now look at how the different subword tokenization algorithms work. Note that all of those tokenization +algorithms rely on some form of training which is usually done on the corpus the corresponding model will be trained +on. + + + +### Byte-Pair Encoding (BPE) + +Byte-Pair Encoding (BPE) was introduced in [Neural Machine Translation of Rare Words with Subword Units (Sennrich et +al., 2015)](https://arxiv.org/abs/1508.07909). BPE relies on a pre-tokenizer that splits the training data into +words. Pretokenization can be as simple as space tokenization, e.g. [GPT-2](model_doc/gpt2), [RoBERTa](model_doc/roberta). More advanced pre-tokenization include rule-based tokenization, e.g. [XLM](model_doc/xlm), +[FlauBERT](model_doc/flaubert) which uses Moses for most languages, or [GPT](model_doc/gpt) which uses +Spacy and ftfy, to count the frequency of each word in the training corpus. + +After pre-tokenization, a set of unique words has been created and the frequency with which each word occurred in the +training data has been determined. 
Next, BPE creates a base vocabulary consisting of all symbols that occur in the set +of unique words and learns merge rules to form a new symbol from two symbols of the base vocabulary. It does so until +the vocabulary has attained the desired vocabulary size. Note that the desired vocabulary size is a hyperparameter to +define before training the tokenizer. + +As an example, let's assume that after pre-tokenization, the following set of words including their frequency has been +determined: + +``` +("hug", 10), ("pug", 5), ("pun", 12), ("bun", 4), ("hugs", 5) +``` + +Consequently, the base vocabulary is `["b", "g", "h", "n", "p", "s", "u"]`. Splitting all words into symbols of the +base vocabulary, we obtain: + +``` +("h" "u" "g", 10), ("p" "u" "g", 5), ("p" "u" "n", 12), ("b" "u" "n", 4), ("h" "u" "g" "s", 5) +``` + +BPE then counts the frequency of each possible symbol pair and picks the symbol pair that occurs most frequently. In +the example above `"h"` followed by `"u"` is present _10 + 5 = 15_ times (10 times in the 10 occurrences of +`"hug"`, 5 times in the 5 occurrences of `"hugs"`). However, the most frequent symbol pair is `"u"` followed by +`"g"`, occurring _10 + 5 + 5 = 20_ times in total. Thus, the first merge rule the tokenizer learns is to group all +`"u"` symbols followed by a `"g"` symbol together. Next, `"ug"` is added to the vocabulary. The set of words then +becomes + +``` +("h" "ug", 10), ("p" "ug", 5), ("p" "u" "n", 12), ("b" "u" "n", 4), ("h" "ug" "s", 5) +``` + +BPE then identifies the next most common symbol pair. It's `"u"` followed by `"n"`, which occurs 16 times. `"u"`, +`"n"` is merged to `"un"` and added to the vocabulary. The next most frequent symbol pair is `"h"` followed by +`"ug"`, occurring 15 times. Again the pair is merged and `"hug"` can be added to the vocabulary. + +At this stage, the vocabulary is `["b", "g", "h", "n", "p", "s", "u", "ug", "un", "hug"]` and our set of unique words +is represented as + +``` +("hug", 10), ("p" "ug", 5), ("p" "un", 12), ("b" "un", 4), ("hug" "s", 5) +``` + +Assuming, that the Byte-Pair Encoding training would stop at this point, the learned merge rules would then be applied +to new words (as long as those new words do not include symbols that were not in the base vocabulary). For instance, +the word `"bug"` would be tokenized to `["b", "ug"]` but `"mug"` would be tokenized as `["", "ug"]` since +the symbol `"m"` is not in the base vocabulary. In general, single letters such as `"m"` are not replaced by the +`""` symbol because the training data usually includes at least one occurrence of each letter, but it is likely +to happen for very special characters like emojis. + +As mentioned earlier, the vocabulary size, *i.e.* the base vocabulary size + the number of merges, is a hyperparameter +to choose. For instance [GPT](model_doc/gpt) has a vocabulary size of 40,478 since they have 478 base characters +and chose to stop training after 40,000 merges. + +#### Byte-level BPE + +A base vocabulary that includes all possible base characters can be quite large if *e.g.* all unicode characters are +considered as base characters. To have a better base vocabulary, [GPT-2](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) uses bytes +as the base vocabulary, which is a clever trick to force the base vocabulary to be of size 256 while ensuring that +every base character is included in the vocabulary. 
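A quick, illustrative check in plain Python (not a tokenizer API) of why bytes make such a convenient base alphabet: any character, however rare, decomposes into byte values below 256:

```py
>>> list("🤗".encode("utf-8"))
[240, 159, 164, 151]
```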
With some additional rules to deal with punctuation, the GPT2's +tokenizer can tokenize every text without the need for the symbol. [GPT-2](model_doc/gpt) has a vocabulary +size of 50,257, which corresponds to the 256 bytes base tokens, a special end-of-text token and the symbols learned +with 50,000 merges. + + + +### WordPiece + +WordPiece is the subword tokenization algorithm used for [BERT](model_doc/bert), [DistilBERT](model_doc/distilbert), and [Electra](model_doc/electra). The algorithm was outlined in [Japanese and Korean +Voice Search (Schuster et al., 2012)](https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf) and is very similar to +BPE. WordPiece first initializes the vocabulary to include every character present in the training data and +progressively learns a given number of merge rules. In contrast to BPE, WordPiece does not choose the most frequent +symbol pair, but the one that maximizes the likelihood of the training data once added to the vocabulary. + +So what does this mean exactly? Referring to the previous example, maximizing the likelihood of the training data is +equivalent to finding the symbol pair, whose probability divided by the probabilities of its first symbol followed by +its second symbol is the greatest among all symbol pairs. *E.g.* `"u"`, followed by `"g"` would have only been +merged if the probability of `"ug"` divided by `"u"`, `"g"` would have been greater than for any other symbol +pair. Intuitively, WordPiece is slightly different to BPE in that it evaluates what it _loses_ by merging two symbols +to ensure it's _worth it_. + + + +### Unigram + +Unigram is a subword tokenization algorithm introduced in [Subword Regularization: Improving Neural Network Translation +Models with Multiple Subword Candidates (Kudo, 2018)](https://arxiv.org/pdf/1804.10959.pdf). In contrast to BPE or +WordPiece, Unigram initializes its base vocabulary to a large number of symbols and progressively trims down each +symbol to obtain a smaller vocabulary. The base vocabulary could for instance correspond to all pre-tokenized words and +the most common substrings. Unigram is not used directly for any of the models in the transformers, but it's used in +conjunction with [SentencePiece](#sentencepiece). + +At each training step, the Unigram algorithm defines a loss (often defined as the log-likelihood) over the training +data given the current vocabulary and a unigram language model. Then, for each symbol in the vocabulary, the algorithm +computes how much the overall loss would increase if the symbol was to be removed from the vocabulary. Unigram then +removes p (with p usually being 10% or 20%) percent of the symbols whose loss increase is the lowest, *i.e.* those +symbols that least affect the overall loss over the training data. This process is repeated until the vocabulary has +reached the desired size. The Unigram algorithm always keeps the base characters so that any word can be tokenized. + +Because Unigram is not based on merge rules (in contrast to BPE and WordPiece), the algorithm has several ways of +tokenizing new text after training. As an example, if a trained Unigram tokenizer exhibits the vocabulary: + +``` +["b", "g", "h", "n", "p", "s", "u", "ug", "un", "hug"], +``` + +`"hugs"` could be tokenized both as `["hug", "s"]`, `["h", "ug", "s"]` or `["h", "u", "g", "s"]`. So which one +to choose? 
Unigram saves the probability of each token in the training corpus on top of saving the vocabulary so that +the probability of each possible tokenization can be computed after training. The algorithm simply picks the most +likely tokenization in practice, but also offers the possibility to sample a possible tokenization according to their +probabilities. + +Those probabilities are defined by the loss the tokenizer is trained on. Assuming that the training data consists of +the words \\(x_{1}, \dots, x_{N}\\) and that the set of all possible tokenizations for a word \\(x_{i}\\) is +defined as \\(S(x_{i})\\), then the overall loss is defined as + +$$\mathcal{L} = -\sum_{i=1}^{N} \log \left ( \sum_{x \in S(x_{i})} p(x) \right )$$ + + + +### SentencePiece + +All tokenization algorithms described so far have the same problem: It is assumed that the input text uses spaces to +separate words. However, not all languages use spaces to separate words. One possible solution is to use language +specific pre-tokenizers, *e.g.* [XLM](model_doc/xlm) uses a specific Chinese, Japanese, and Thai pre-tokenizer). +To solve this problem more generally, [SentencePiece: A simple and language independent subword tokenizer and +detokenizer for Neural Text Processing (Kudo et al., 2018)](https://arxiv.org/pdf/1808.06226.pdf) treats the input +as a raw input stream, thus including the space in the set of characters to use. It then uses the BPE or unigram +algorithm to construct the appropriate vocabulary. + +The [`XLNetTokenizer`] uses SentencePiece for example, which is also why in the example earlier the +`"▁"` character was included in the vocabulary. Decoding with SentencePiece is very easy since all tokens can just be +concatenated and `"▁"` is replaced by a space. + +All transformers models in the library that use SentencePiece use it in combination with unigram. Examples of models +using SentencePiece are [ALBERT](model_doc/albert), [XLNet](model_doc/xlnet), [Marian](model_doc/marian), and [T5](model_doc/t5). diff --git a/docs/source/en/tokenizer_summary.mdx b/docs/source/en/tokenizer_summary.mdx deleted file mode 100644 index 942fe279068e..000000000000 --- a/docs/source/en/tokenizer_summary.mdx +++ /dev/null @@ -1,278 +0,0 @@ - - -# Summary of the tokenizers - -[[open-in-colab]] - -On this page, we will have a closer look at tokenization. - - - -As we saw in [the preprocessing tutorial](preprocessing), tokenizing a text is splitting it into words or -subwords, which then are converted to ids through a look-up table. Converting words or subwords to ids is -straightforward, so in this summary, we will focus on splitting a text into words or subwords (i.e. tokenizing a text). -More specifically, we will look at the three main types of tokenizers used in 🤗 Transformers: [Byte-Pair Encoding -(BPE)](#byte-pair-encoding), [WordPiece](#wordpiece), and [SentencePiece](#sentencepiece), and show examples -of which tokenizer type is used by which model. - -Note that on each model page, you can look at the documentation of the associated tokenizer to know which tokenizer -type was used by the pretrained model. For instance, if we look at [`BertTokenizer`], we can see -that the model uses [WordPiece](#wordpiece). - -## Introduction - -Splitting a text into smaller chunks is a task that is harder than it looks, and there are multiple ways of doing so. -For instance, let's look at the sentence `"Don't you love 🤗 Transformers? 
We sure do."` - - - -A simple way of tokenizing this text is to split it by spaces, which would give: - -``` -["Don't", "you", "love", "🤗", "Transformers?", "We", "sure", "do."] -``` - -This is a sensible first step, but if we look at the tokens `"Transformers?"` and `"do."`, we notice that the -punctuation is attached to the words `"Transformer"` and `"do"`, which is suboptimal. We should take the -punctuation into account so that a model does not have to learn a different representation of a word and every possible -punctuation symbol that could follow it, which would explode the number of representations the model has to learn. -Taking punctuation into account, tokenizing our exemplary text would give: - -``` -["Don", "'", "t", "you", "love", "🤗", "Transformers", "?", "We", "sure", "do", "."] -``` - -Better. However, it is disadvantageous, how the tokenization dealt with the word `"Don't"`. `"Don't"` stands for -`"do not"`, so it would be better tokenized as `["Do", "n't"]`. This is where things start getting complicated, and -part of the reason each model has its own tokenizer type. Depending on the rules we apply for tokenizing a text, a -different tokenized output is generated for the same text. A pretrained model only performs properly if you feed it an -input that was tokenized with the same rules that were used to tokenize its training data. - -[spaCy](https://spacy.io/) and [Moses](http://www.statmt.org/moses/?n=Development.GetStarted) are two popular -rule-based tokenizers. Applying them on our example, *spaCy* and *Moses* would output something like: - -``` -["Do", "n't", "you", "love", "🤗", "Transformers", "?", "We", "sure", "do", "."] -``` - -As can be seen space and punctuation tokenization, as well as rule-based tokenization, is used here. Space and -punctuation tokenization and rule-based tokenization are both examples of word tokenization, which is loosely defined -as splitting sentences into words. While it's the most intuitive way to split texts into smaller chunks, this -tokenization method can lead to problems for massive text corpora. In this case, space and punctuation tokenization -usually generates a very big vocabulary (the set of all unique words and tokens used). *E.g.*, [Transformer XL](model_doc/transformerxl) uses space and punctuation tokenization, resulting in a vocabulary size of 267,735! - -Such a big vocabulary size forces the model to have an enormous embedding matrix as the input and output layer, which -causes both an increased memory and time complexity. In general, transformers models rarely have a vocabulary size -greater than 50,000, especially if they are pretrained only on a single language. - -So if simple space and punctuation tokenization is unsatisfactory, why not simply tokenize on characters? - - - -While character tokenization is very simple and would greatly reduce memory and time complexity it makes it much harder -for the model to learn meaningful input representations. *E.g.* learning a meaningful context-independent -representation for the letter `"t"` is much harder than learning a context-independent representation for the word -`"today"`. Therefore, character tokenization is often accompanied by a loss of performance. So to get the best of -both worlds, transformers models use a hybrid between word-level and character-level tokenization called **subword** -tokenization. 
- -## Subword tokenization - - - -Subword tokenization algorithms rely on the principle that frequently used words should not be split into smaller -subwords, but rare words should be decomposed into meaningful subwords. For instance `"annoyingly"` might be -considered a rare word and could be decomposed into `"annoying"` and `"ly"`. Both `"annoying"` and `"ly"` as -stand-alone subwords would appear more frequently while at the same time the meaning of `"annoyingly"` is kept by the -composite meaning of `"annoying"` and `"ly"`. This is especially useful in agglutinative languages such as Turkish, -where you can form (almost) arbitrarily long complex words by stringing together subwords. - -Subword tokenization allows the model to have a reasonable vocabulary size while being able to learn meaningful -context-independent representations. In addition, subword tokenization enables the model to process words it has never -seen before, by decomposing them into known subwords. For instance, the [`~transformers.BertTokenizer`] tokenizes -`"I have a new GPU!"` as follows: - -```py ->>> from transformers import BertTokenizer - ->>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") ->>> tokenizer.tokenize("I have a new GPU!") -["i", "have", "a", "new", "gp", "##u", "!"] -``` - -Because we are considering the uncased model, the sentence was lowercased first. We can see that the words `["i", "have", "a", "new"]` are present in the tokenizer's vocabulary, but the word `"gpu"` is not. Consequently, the -tokenizer splits `"gpu"` into known subwords: `["gp" and "##u"]`. `"##"` means that the rest of the token should -be attached to the previous one, without space (for decoding or reversal of the tokenization). - -As another example, [`~transformers.XLNetTokenizer`] tokenizes our previously exemplary text as follows: - -```py ->>> from transformers import XLNetTokenizer - ->>> tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased") ->>> tokenizer.tokenize("Don't you love 🤗 Transformers? We sure do.") -["▁Don", "'", "t", "▁you", "▁love", "▁", "🤗", "▁", "Transform", "ers", "?", "▁We", "▁sure", "▁do", "."] -``` - -We'll get back to the meaning of those `"▁"` when we look at [SentencePiece](#sentencepiece). As one can see, -the rare word `"Transformers"` has been split into the more frequent subwords `"Transform"` and `"ers"`. - -Let's now look at how the different subword tokenization algorithms work. Note that all of those tokenization -algorithms rely on some form of training which is usually done on the corpus the corresponding model will be trained -on. - - - -### Byte-Pair Encoding (BPE) - -Byte-Pair Encoding (BPE) was introduced in [Neural Machine Translation of Rare Words with Subword Units (Sennrich et -al., 2015)](https://arxiv.org/abs/1508.07909). BPE relies on a pre-tokenizer that splits the training data into -words. Pretokenization can be as simple as space tokenization, e.g. [GPT-2](model_doc/gpt2), [Roberta](model_doc/roberta). More advanced pre-tokenization include rule-based tokenization, e.g. [XLM](model_doc/xlm), -[FlauBERT](model_doc/flaubert) which uses Moses for most languages, or [GPT](model_doc/gpt) which uses -Spacy and ftfy, to count the frequency of each word in the training corpus. - -After pre-tokenization, a set of unique words has been created and the frequency of each word it occurred in the -training data has been determined. 
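As a toy illustration of that counting step (the corpus and the whitespace pre-tokenizer below are made up for this example, not taken from a real training setup), the word frequencies BPE starts from can be gathered with a `Counter`:

```py
from collections import Counter

corpus = "hug hug pug pun bun hugs hug pun"  # toy corpus, whitespace pre-tokenization
word_freqs = Counter(corpus.split())
print(word_freqs)
# Counter({'hug': 3, 'pun': 2, 'pug': 1, 'bun': 1, 'hugs': 1})
```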
Next, BPE creates a base vocabulary consisting of all symbols that occur in the set -of unique words and learns merge rules to form a new symbol from two symbols of the base vocabulary. It does so until -the vocabulary has attained the desired vocabulary size. Note that the desired vocabulary size is a hyperparameter to -define before training the tokenizer. - -As an example, let's assume that after pre-tokenization, the following set of words including their frequency has been -determined: - -``` -("hug", 10), ("pug", 5), ("pun", 12), ("bun", 4), ("hugs", 5) -``` - -Consequently, the base vocabulary is `["b", "g", "h", "n", "p", "s", "u"]`. Splitting all words into symbols of the -base vocabulary, we obtain: - -``` -("h" "u" "g", 10), ("p" "u" "g", 5), ("p" "u" "n", 12), ("b" "u" "n", 4), ("h" "u" "g" "s", 5) -``` - -BPE then counts the frequency of each possible symbol pair and picks the symbol pair that occurs most frequently. In -the example above `"h"` followed by `"u"` is present _10 + 5 = 15_ times (10 times in the 10 occurrences of -`"hug"`, 5 times in the 5 occurrences of `"hugs"`). However, the most frequent symbol pair is `"u"` followed by -`"g"`, occurring _10 + 5 + 5 = 20_ times in total. Thus, the first merge rule the tokenizer learns is to group all -`"u"` symbols followed by a `"g"` symbol together. Next, `"ug"` is added to the vocabulary. The set of words then -becomes - -``` -("h" "ug", 10), ("p" "ug", 5), ("p" "u" "n", 12), ("b" "u" "n", 4), ("h" "ug" "s", 5) -``` - -BPE then identifies the next most common symbol pair. It's `"u"` followed by `"n"`, which occurs 16 times. `"u"`, -`"n"` is merged to `"un"` and added to the vocabulary. The next most frequent symbol pair is `"h"` followed by -`"ug"`, occurring 15 times. Again the pair is merged and `"hug"` can be added to the vocabulary. - -At this stage, the vocabulary is `["b", "g", "h", "n", "p", "s", "u", "ug", "un", "hug"]` and our set of unique words -is represented as - -``` -("hug", 10), ("p" "ug", 5), ("p" "un", 12), ("b" "un", 4), ("hug" "s", 5) -``` - -Assuming, that the Byte-Pair Encoding training would stop at this point, the learned merge rules would then be applied -to new words (as long as those new words do not include symbols that were not in the base vocabulary). For instance, -the word `"bug"` would be tokenized to `["b", "ug"]` but `"mug"` would be tokenized as `["", "ug"]` since -the symbol `"m"` is not in the base vocabulary. In general, single letters such as `"m"` are not replaced by the -`""` symbol because the training data usually includes at least one occurrence of each letter, but it is likely -to happen for very special characters like emojis. - -As mentioned earlier, the vocabulary size, *i.e.* the base vocabulary size + the number of merges, is a hyperparameter -to choose. For instance [GPT](model_doc/gpt) has a vocabulary size of 40,478 since they have 478 base characters -and chose to stop training after 40,000 merges. - -#### Byte-level BPE - -A base vocabulary that includes all possible base characters can be quite large if *e.g.* all unicode characters are -considered as base characters. To have a better base vocabulary, [GPT-2](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) uses bytes -as the base vocabulary, which is a clever trick to force the base vocabulary to be of size 256 while ensuring that -every base character is included in the vocabulary. 
With some additional rules to deal with punctuation, the GPT2's -tokenizer can tokenize every text without the need for the symbol. [GPT-2](model_doc/gpt) has a vocabulary -size of 50,257, which corresponds to the 256 bytes base tokens, a special end-of-text token and the symbols learned -with 50,000 merges. - - - -### WordPiece - -WordPiece is the subword tokenization algorithm used for [BERT](model_doc/bert), [DistilBERT](model_doc/distilbert), and [Electra](model_doc/electra). The algorithm was outlined in [Japanese and Korean -Voice Search (Schuster et al., 2012)](https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf) and is very similar to -BPE. WordPiece first initializes the vocabulary to include every character present in the training data and -progressively learns a given number of merge rules. In contrast to BPE, WordPiece does not choose the most frequent -symbol pair, but the one that maximizes the likelihood of the training data once added to the vocabulary. - -So what does this mean exactly? Referring to the previous example, maximizing the likelihood of the training data is -equivalent to finding the symbol pair, whose probability divided by the probabilities of its first symbol followed by -its second symbol is the greatest among all symbol pairs. *E.g.* `"u"`, followed by `"g"` would have only been -merged if the probability of `"ug"` divided by `"u"`, `"g"` would have been greater than for any other symbol -pair. Intuitively, WordPiece is slightly different to BPE in that it evaluates what it _loses_ by merging two symbols -to ensure it's _worth it_. - - - -### Unigram - -Unigram is a subword tokenization algorithm introduced in [Subword Regularization: Improving Neural Network Translation -Models with Multiple Subword Candidates (Kudo, 2018)](https://arxiv.org/pdf/1804.10959.pdf). In contrast to BPE or -WordPiece, Unigram initializes its base vocabulary to a large number of symbols and progressively trims down each -symbol to obtain a smaller vocabulary. The base vocabulary could for instance correspond to all pre-tokenized words and -the most common substrings. Unigram is not used directly for any of the models in the transformers, but it's used in -conjunction with [SentencePiece](#sentencepiece). - -At each training step, the Unigram algorithm defines a loss (often defined as the log-likelihood) over the training -data given the current vocabulary and a unigram language model. Then, for each symbol in the vocabulary, the algorithm -computes how much the overall loss would increase if the symbol was to be removed from the vocabulary. Unigram then -removes p (with p usually being 10% or 20%) percent of the symbols whose loss increase is the lowest, *i.e.* those -symbols that least affect the overall loss over the training data. This process is repeated until the vocabulary has -reached the desired size. The Unigram algorithm always keeps the base characters so that any word can be tokenized. - -Because Unigram is not based on merge rules (in contrast to BPE and WordPiece), the algorithm has several ways of -tokenizing new text after training. As an example, if a trained Unigram tokenizer exhibits the vocabulary: - -``` -["b", "g", "h", "n", "p", "s", "u", "ug", "un", "hug"], -``` - -`"hugs"` could be tokenized both as `["hug", "s"]`, `["h", "ug", "s"]` or `["h", "u", "g", "s"]`. So which one -to choose? 
Unigram saves the probability of each token in the training corpus on top of saving the vocabulary so that -the probability of each possible tokenization can be computed after training. The algorithm simply picks the most -likely tokenization in practice, but also offers the possibility to sample a possible tokenization according to their -probabilities. - -Those probabilities are defined by the loss the tokenizer is trained on. Assuming that the training data consists of -the words \\(x_{1}, \dots, x_{N}\\) and that the set of all possible tokenizations for a word \\(x_{i}\\) is -defined as \\(S(x_{i})\\), then the overall loss is defined as - -$$\mathcal{L} = -\sum_{i=1}^{N} \log \left ( \sum_{x \in S(x_{i})} p(x) \right )$$ - - - -### SentencePiece - -All tokenization algorithms described so far have the same problem: It is assumed that the input text uses spaces to -separate words. However, not all languages use spaces to separate words. One possible solution is to use language -specific pre-tokenizers, *e.g.* [XLM](model_doc/xlm) uses a specific Chinese, Japanese, and Thai pre-tokenizer). -To solve this problem more generally, [SentencePiece: A simple and language independent subword tokenizer and -detokenizer for Neural Text Processing (Kudo et al., 2018)](https://arxiv.org/pdf/1808.06226.pdf) treats the input -as a raw input stream, thus including the space in the set of characters to use. It then uses the BPE or unigram -algorithm to construct the appropriate vocabulary. - -The [`XLNetTokenizer`] uses SentencePiece for example, which is also why in the example earlier the -`"▁"` character was included in the vocabulary. Decoding with SentencePiece is very easy since all tokens can just be -concatenated and `"▁"` is replaced by a space. - -All transformers models in the library that use SentencePiece use it in combination with unigram. Examples of models -using SentencePiece are [ALBERT](model_doc/albert), [XLNet](model_doc/xlnet), [Marian](model_doc/marian), and [T5](model_doc/t5). diff --git a/docs/source/en/torchscript.md b/docs/source/en/torchscript.md new file mode 100644 index 000000000000..adf34b2ea699 --- /dev/null +++ b/docs/source/en/torchscript.md @@ -0,0 +1,229 @@ + + +# Export to TorchScript + + + +This is the very beginning of our experiments with TorchScript and we are still +exploring its capabilities with variable-input-size models. It is a focus of interest to +us and we will deepen our analysis in upcoming releases, with more code examples, a more +flexible implementation, and benchmarks comparing Python-based codes with compiled +TorchScript. + + + +According to the [TorchScript documentation](https://pytorch.org/docs/stable/jit.html): + +> TorchScript is a way to create serializable and optimizable models from PyTorch code. + +There are two PyTorch modules, [JIT and +TRACE](https://pytorch.org/docs/stable/jit.html), that allow developers to export their +models to be reused in other programs like efficiency-oriented C++ programs. + +We provide an interface that allows you to export 🤗 Transformers models to TorchScript +so they can be reused in a different environment than PyTorch-based Python programs. +Here, we explain how to export and use our models using TorchScript. + +Exporting a model requires two things: + +- model instantiation with the `torchscript` flag +- a forward pass with dummy inputs + +These necessities imply several things developers should be careful about as detailed +below. 
+ +## TorchScript flag and tied weights + +The `torchscript` flag is necessary because most of the 🤗 Transformers language models +have tied weights between their `Embedding` layer and their `Decoding` layer. +TorchScript does not allow you to export models that have tied weights, so it is +necessary to untie and clone the weights beforehand. + +Models instantiated with the `torchscript` flag have their `Embedding` layer and +`Decoding` layer separated, which means that they should not be trained down the line. +Training would desynchronize the two layers, leading to unexpected results. + +This is not the case for models that do not have a language model head, as those do not +have tied weights. These models can be safely exported without the `torchscript` flag. + +## Dummy inputs and standard lengths + +The dummy inputs are used for a models forward pass. While the inputs' values are +propagated through the layers, PyTorch keeps track of the different operations executed +on each tensor. These recorded operations are then used to create the *trace* of the +model. + +The trace is created relative to the inputs' dimensions. It is therefore constrained by +the dimensions of the dummy input, and will not work for any other sequence length or +batch size. When trying with a different size, the following error is raised: + +``` +`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2` +``` + +We recommended you trace the model with a dummy input size at least as large as the +largest input that will be fed to the model during inference. Padding can help fill the +missing values. However, since the model is traced with a larger input size, the +dimensions of the matrix will also be large, resulting in more calculations. + +Be careful of the total number of operations done on each input and follow the +performance closely when exporting varying sequence-length models. + +## Using TorchScript in Python + +This section demonstrates how to save and load models as well as how to use the trace +for inference. + +### Saving a model + +To export a `BertModel` with TorchScript, instantiate `BertModel` from the `BertConfig` +class and then save it to disk under the filename `traced_bert.pt`: + +```python +from transformers import BertModel, BertTokenizer, BertConfig +import torch + +enc = BertTokenizer.from_pretrained("bert-base-uncased") + +# Tokenizing input text +text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" +tokenized_text = enc.tokenize(text) + +# Masking one of the input tokens +masked_index = 8 +tokenized_text[masked_index] = "[MASK]" +indexed_tokens = enc.convert_tokens_to_ids(tokenized_text) +segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1] + +# Creating a dummy input +tokens_tensor = torch.tensor([indexed_tokens]) +segments_tensors = torch.tensor([segments_ids]) +dummy_input = [tokens_tensor, segments_tensors] + +# Initializing the model with the torchscript flag +# Flag set to True even though it is not necessary as this model does not have an LM Head. 
+config = BertConfig( + vocab_size_or_config_json_file=32000, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + torchscript=True, +) + +# Instantiating the model +model = BertModel(config) + +# The model needs to be in evaluation mode +model.eval() + +# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag +model = BertModel.from_pretrained("bert-base-uncased", torchscript=True) + +# Creating the trace +traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors]) +torch.jit.save(traced_model, "traced_bert.pt") +``` + +### Loading a model + +Now you can load the previously saved `BertModel`, `traced_bert.pt`, from disk and use +it on the previously initialised `dummy_input`: + +```python +loaded_model = torch.jit.load("traced_bert.pt") +loaded_model.eval() + +all_encoder_layers, pooled_output = loaded_model(*dummy_input) +``` + +### Using a traced model for inference + +Use the traced model for inference by using its `__call__` dunder method: + +```python +traced_model(tokens_tensor, segments_tensors) +``` + +## Deploy Hugging Face TorchScript models to AWS with the Neuron SDK + +AWS introduced the [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) +instance family for low cost, high performance machine learning inference in the cloud. +The Inf1 instances are powered by the AWS Inferentia chip, a custom-built hardware +accelerator, specializing in deep learning inferencing workloads. [AWS +Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) is the SDK for +Inferentia that supports tracing and optimizing transformers models for deployment on +Inf1. The Neuron SDK provides: + + +1. Easy-to-use API with one line of code change to trace and optimize a TorchScript + model for inference in the cloud. +2. Out of the box performance optimizations for [improved + cost-performance](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/>). +3. Support for Hugging Face transformers models built with either + [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) + or + [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html). + +### Implications + +Transformers models based on the [BERT (Bidirectional Encoder Representations from +Transformers)](https://huggingface.co/docs/transformers/main/model_doc/bert) +architecture, or its variants such as +[distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) and +[roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta) run best on +Inf1 for non-generative tasks such as extractive question answering, sequence +classification, and token classification. However, text generation tasks can still be +adapted to run on Inf1 according to this [AWS Neuron MarianMT +tutorial](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html). +More information about models that can be converted out of the box on Inferentia can be +found in the [Model Architecture +Fit](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia) +section of the Neuron documentation. 
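Whichever target you deploy to, it can be worth sanity-checking that the traced module reproduces the eager model's outputs before shipping it. The sketch below is only such a check; it assumes the `model`, `traced_model`, `tokens_tensor`, and `segments_tensors` objects from the earlier tracing example are still in scope:

```python
import torch

model.eval()
with torch.no_grad():
    eager_output = model(tokens_tensor, segments_tensors)
    traced_output = traced_model(tokens_tensor, segments_tensors)

# With torchscript=True the model returns tuples; compare the sequence outputs
print(torch.allclose(eager_output[0], traced_output[0], atol=1e-5))
```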
+ +### Dependencies + +Using AWS Neuron to convert models requires a [Neuron SDK +environment](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide) +which comes preconfigured on [AWS Deep Learning +AMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html). + +### Converting a model for AWS Neuron + +Convert a model for AWS NEURON using the same code from [Using TorchScript in +Python](torchscript#using-torchscript-in-python) to trace a `BertModel`. Import the +`torch.neuron` framework extension to access the components of the Neuron SDK through a +Python API: + +```python +from transformers import BertModel, BertTokenizer, BertConfig +import torch +import torch.neuron +``` + +You only need to modify the following line: + +```diff +- torch.jit.trace(model, [tokens_tensor, segments_tensors]) ++ torch.neuron.trace(model, [token_tensor, segments_tensors]) +``` + +This enables the Neuron SDK to trace the model and optimize it for Inf1 instances. + +To learn more about AWS Neuron SDK features, tools, example tutorials and latest +updates, please see the [AWS NeuronSDK +documentation](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html). diff --git a/docs/source/en/torchscript.mdx b/docs/source/en/torchscript.mdx deleted file mode 100644 index 0840973ad078..000000000000 --- a/docs/source/en/torchscript.mdx +++ /dev/null @@ -1,225 +0,0 @@ - - -# Export to TorchScript - - - -This is the very beginning of our experiments with TorchScript and we are still -exploring its capabilities with variable-input-size models. It is a focus of interest to -us and we will deepen our analysis in upcoming releases, with more code examples, a more -flexible implementation, and benchmarks comparing Python-based codes with compiled -TorchScript. - - - -According to the [TorchScript documentation](https://pytorch.org/docs/stable/jit.html): - -> TorchScript is a way to create serializable and optimizable models from PyTorch code. - -There are two PyTorch modules, [JIT and -TRACE](https://pytorch.org/docs/stable/jit.html), that allow developers to export their -models to be reused in other programs like efficiency-oriented C++ programs. - -We provide an interface that allows you to export 🤗 Transformers models to TorchScript -so they can be reused in a different environment than PyTorch-based Python programs. -Here, we explain how to export and use our models using TorchScript. - -Exporting a model requires two things: - -- model instantiation with the `torchscript` flag -- a forward pass with dummy inputs - -These necessities imply several things developers should be careful about as detailed -below. - -## TorchScript flag and tied weights - -The `torchscript` flag is necessary because most of the 🤗 Transformers language models -have tied weights between their `Embedding` layer and their `Decoding` layer. -TorchScript does not allow you to export models that have tied weights, so it is -necessary to untie and clone the weights beforehand. - -Models instantiated with the `torchscript` flag have their `Embedding` layer and -`Decoding` layer separated, which means that they should not be trained down the line. -Training would desynchronize the two layers, leading to unexpected results. - -This is not the case for models that do not have a language model head, as those do not -have tied weights. These models can be safely exported without the `torchscript` flag. 
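If you want to see the weight tying that the `torchscript` flag works around, one rough probe is whether the input and output embeddings share storage. This is illustrative only (the exact modules involved differ between architectures), and it uses a masked-LM head because a plain `BertModel` has no decoding layer:

```python
from transformers import BertForMaskedLM

tied_model = BertForMaskedLM.from_pretrained("bert-base-uncased")
untied_model = BertForMaskedLM.from_pretrained("bert-base-uncased", torchscript=True)


def shares_embedding_storage(model):
    # Tied weights point at the same underlying tensor storage
    return (
        model.get_input_embeddings().weight.data_ptr()
        == model.get_output_embeddings().weight.data_ptr()
    )


print(shares_embedding_storage(tied_model))  # True: the LM head reuses the embedding weights
print(shares_embedding_storage(untied_model))  # False: with torchscript=True the weights are untied and cloned
```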
- -## Dummy inputs and standard lengths - -The dummy inputs are used for a models forward pass. While the inputs' values are -propagated through the layers, PyTorch keeps track of the different operations executed -on each tensor. These recorded operations are then used to create the *trace* of the -model. - -The trace is created relative to the inputs' dimensions. It is therefore constrained by -the dimensions of the dummy input, and will not work for any other sequence length or -batch size. When trying with a different size, the following error is raised: - -``` -`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2` -``` - -We recommended you trace the model with a dummy input size at least as large as the -largest input that will be fed to the model during inference. Padding can help fill the -missing values. However, since the model is traced with a larger input size, the -dimensions of the matrix will also be large, resulting in more calculations. - -Be careful of the total number of operations done on each input and follow the -performance closely when exporting varying sequence-length models. - -## Using TorchScript in Python - -This section demonstrates how to save and load models as well as how to use the trace -for inference. - -### Saving a model - -To export a `BertModel` with TorchScript, instantiate `BertModel` from the `BertConfig` -class and then save it to disk under the filename `traced_bert.pt`: - -```python -from transformers import BertModel, BertTokenizer, BertConfig -import torch - -enc = BertTokenizer.from_pretrained("bert-base-uncased") - -# Tokenizing input text -text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" -tokenized_text = enc.tokenize(text) - -# Masking one of the input tokens -masked_index = 8 -tokenized_text[masked_index] = "[MASK]" -indexed_tokens = enc.convert_tokens_to_ids(tokenized_text) -segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1] - -# Creating a dummy input -tokens_tensor = torch.tensor([indexed_tokens]) -segments_tensors = torch.tensor([segments_ids]) -dummy_input = [tokens_tensor, segments_tensors] - -# Initializing the model with the torchscript flag -# Flag set to True even though it is not necessary as this model does not have an LM Head. 
-config = BertConfig( - vocab_size_or_config_json_file=32000, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - torchscript=True, -) - -# Instantiating the model -model = BertModel(config) - -# The model needs to be in evaluation mode -model.eval() - -# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag -model = BertModel.from_pretrained("bert-base-uncased", torchscript=True) - -# Creating the trace -traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors]) -torch.jit.save(traced_model, "traced_bert.pt") -``` - -### Loading a model - -Now you can load the previously saved `BertModel`, `traced_bert.pt`, from disk and use -it on the previously initialised `dummy_input`: - -```python -loaded_model = torch.jit.load("traced_bert.pt") -loaded_model.eval() - -all_encoder_layers, pooled_output = loaded_model(*dummy_input) -``` - -### Using a traced model for inference - -Use the traced model for inference by using its `__call__` dunder method: - -```python -traced_model(tokens_tensor, segments_tensors) -``` - -## Deploy Hugging Face TorchScript models to AWS with the Neuron SDK - -AWS introduced the [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) -instance family for low cost, high performance machine learning inference in the cloud. -The Inf1 instances are powered by the AWS Inferentia chip, a custom-built hardware -accelerator, specializing in deep learning inferencing workloads. [AWS -Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) is the SDK for -Inferentia that supports tracing and optimizing transformers models for deployment on -Inf1. The Neuron SDK provides: - - -1. Easy-to-use API with one line of code change to trace and optimize a TorchScript - model for inference in the cloud. -2. Out of the box performance optimizations for [improved - cost-performance](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/>). -3. Support for Hugging Face transformers models built with either - [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) - or - [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html). - -### Implications - -Transformers models based on the [BERT (Bidirectional Encoder Representations from -Transformers)](https://huggingface.co/docs/transformers/main/model_doc/bert) -architecture, or its variants such as -[distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) and -[roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta) run best on -Inf1 for non-generative tasks such as extractive question answering, sequence -classification, and token classification. However, text generation tasks can still be -adapted to run on Inf1 according to this [AWS Neuron MarianMT -tutorial](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html). -More information about models that can be converted out of the box on Inferentia can be -found in the [Model Architecture -Fit](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia) -section of the Neuron documentation. 
- -### Dependencies - -Using AWS Neuron to convert models requires a [Neuron SDK -environment](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide) -which comes preconfigured on [AWS Deep Learning -AMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html). - -### Converting a model for AWS Neuron - -Convert a model for AWS NEURON using the same code from [Using TorchScript in -Python](serialization#using-torchscript-in-python) to trace a `BertModel`. Import the -`torch.neuron` framework extension to access the components of the Neuron SDK through a -Python API: - -```python -from transformers import BertModel, BertTokenizer, BertConfig -import torch -import torch.neuron -``` - -You only need to modify the following line: - -```diff -- torch.jit.trace(model, [tokens_tensor, segments_tensors]) -+ torch.neuron.trace(model, [token_tensor, segments_tensors]) -``` - -This enables the Neuron SDK to trace the model and optimize it for Inf1 instances. - -To learn more about AWS Neuron SDK features, tools, example tutorials and latest -updates, please see the [AWS NeuronSDK -documentation](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html). diff --git a/docs/source/en/training.md b/docs/source/en/training.md new file mode 100644 index 000000000000..fb4a0b6a279e --- /dev/null +++ b/docs/source/en/training.md @@ -0,0 +1,434 @@ + + +# Fine-tune a pretrained model + +[[open-in-colab]] + +There are significant benefits to using a pretrained model. It reduces computation costs, your carbon footprint, and allows you to use state-of-the-art models without having to train one from scratch. 🤗 Transformers provides access to thousands of pretrained models for a wide range of tasks. When you use a pretrained model, you train it on a dataset specific to your task. This is known as fine-tuning, an incredibly powerful training technique. In this tutorial, you will fine-tune a pretrained model with a deep learning framework of your choice: + +* Fine-tune a pretrained model with 🤗 Transformers [`Trainer`]. +* Fine-tune a pretrained model in TensorFlow with Keras. +* Fine-tune a pretrained model in native PyTorch. + + + +## Prepare a dataset + + + +Before you can fine-tune a pretrained model, download a dataset and prepare it for training. The previous tutorial showed you how to process data for training, and now you get an opportunity to put those skills to the test! + +Begin by loading the [Yelp Reviews](https://huggingface.co/datasets/yelp_review_full) dataset: + +```py +>>> from datasets import load_dataset + +>>> dataset = load_dataset("yelp_review_full") +>>> dataset["train"][100] +{'label': 0, + 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. 
She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'} +``` + +As you now know, you need a tokenizer to process the text and include a padding and truncation strategy to handle any variable sequence lengths. To process your dataset in one step, use 🤗 Datasets [`map`](https://huggingface.co/docs/datasets/process.html#map) method to apply a preprocessing function over the entire dataset: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + + +>>> def tokenize_function(examples): +... return tokenizer(examples["text"], padding="max_length", truncation=True) + + +>>> tokenized_datasets = dataset.map(tokenize_function, batched=True) +``` + +If you like, you can create a smaller subset of the full dataset to fine-tune on to reduce the time it takes: + +```py +>>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) +>>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) +``` + + + +## Train + +At this point, you should follow the section corresponding to the framework you want to use. You can use the links +in the right sidebar to jump to the one you want - and if you want to hide all of the content for a given framework, +just use the button at the top-right of that framework's block! + + + + + +## Train with PyTorch Trainer + +🤗 Transformers provides a [`Trainer`] class optimized for training 🤗 Transformers models, making it easier to start training without manually writing your own training loop. The [`Trainer`] API supports a wide range of training options and features such as logging, gradient accumulation, and mixed precision. + +Start by loading your model and specify the number of expected labels. From the Yelp Review [dataset card](https://huggingface.co/datasets/yelp_review_full#data-fields), you know there are five labels: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) +``` + + + +You will see a warning about some of the pretrained weights not being used and some weights being randomly +initialized. Don't worry, this is completely normal! The pretrained head of the BERT model is discarded, and replaced with a randomly initialized classification head. You will fine-tune this new model head on your sequence classification task, transferring the knowledge of the pretrained model to it. + + + +### Training hyperparameters + +Next, create a [`TrainingArguments`] class which contains all the hyperparameters you can tune as well as flags for activating different training options. For this tutorial you can start with the default training [hyperparameters](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments), but feel free to experiment with these to find your optimal settings. 
+ +Specify where to save the checkpoints from your training: + +```py +>>> from transformers import TrainingArguments + +>>> training_args = TrainingArguments(output_dir="test_trainer") +``` + +### Evaluate + +[`Trainer`] does not automatically evaluate model performance during training. You'll need to pass [`Trainer`] a function to compute and report metrics. The [🤗 Evaluate](https://huggingface.co/docs/evaluate/index) library provides a simple [`accuracy`](https://huggingface.co/spaces/evaluate-metric/accuracy) function you can load with the [`evaluate.load`] (see this [quicktour](https://huggingface.co/docs/evaluate/a_quick_tour) for more information) function: + +```py +>>> import numpy as np +>>> import evaluate + +>>> metric = evaluate.load("accuracy") +``` + +Call [`~evaluate.compute`] on `metric` to calculate the accuracy of your predictions. Before passing your predictions to `compute`, you need to convert the predictions to logits (remember all 🤗 Transformers models return logits): + +```py +>>> def compute_metrics(eval_pred): +... logits, labels = eval_pred +... predictions = np.argmax(logits, axis=-1) +... return metric.compute(predictions=predictions, references=labels) +``` + +If you'd like to monitor your evaluation metrics during fine-tuning, specify the `evaluation_strategy` parameter in your training arguments to report the evaluation metric at the end of each epoch: + +```py +>>> from transformers import TrainingArguments, Trainer + +>>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") +``` + +### Trainer + +Create a [`Trainer`] object with your model, training arguments, training and test datasets, and evaluation function: + +```py +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=small_train_dataset, +... eval_dataset=small_eval_dataset, +... compute_metrics=compute_metrics, +... ) +``` + +Then fine-tune your model by calling [`~transformers.Trainer.train`]: + +```py +>>> trainer.train() +``` + + + + + + +## Train a TensorFlow model with Keras + +You can also train 🤗 Transformers models in TensorFlow with the Keras API! + +### Loading data for Keras + +When you want to train a 🤗 Transformers model with the Keras API, you need to convert your dataset to a format that +Keras understands. If your dataset is small, you can just convert the whole thing to NumPy arrays and pass it to Keras. +Let's try that first before we do anything more complicated. + +First, load a dataset. We'll use the CoLA dataset from the [GLUE benchmark](https://huggingface.co/datasets/glue), +since it's a simple binary text classification task, and just take the training split for now. + +```py +from datasets import load_dataset + +dataset = load_dataset("glue", "cola") +dataset = dataset["train"] # Just take the training split for now +``` + +Next, load a tokenizer and tokenize the data as NumPy arrays. Note that the labels are already a list of 0 and 1s, +so we can just convert that directly to a NumPy array without tokenization! 
+ +```py +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") +tokenized_data = tokenizer(dataset["sentence"], return_tensors="np", padding=True) +# Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras +tokenized_data = dict(tokenized_data) + +labels = np.array(dataset["label"]) # Label is already an array of 0 and 1 +``` + +Finally, load, [`compile`](https://keras.io/api/models/model_training_apis/#compile-method), and [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) the model. Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +```py +from transformers import TFAutoModelForSequenceClassification +from tensorflow.keras.optimizers import Adam + +# Load and compile our model +model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased") +# Lower learning rates are often better for fine-tuning transformers +model.compile(optimizer=Adam(3e-5)) # No loss argument! + +model.fit(tokenized_data, labels) +``` + + + +You don't have to pass a loss argument to your models when you `compile()` them! Hugging Face models automatically +choose a loss that is appropriate for their task and model architecture if this argument is left blank. You can always +override this by specifying a loss yourself if you want to! + + + +This approach works great for smaller datasets, but for larger datasets, you might find it starts to become a problem. Why? +Because the tokenized array and labels would have to be fully loaded into memory, and because NumPy doesn’t handle +“jagged” arrays, so every tokenized sample would have to be padded to the length of the longest sample in the whole +dataset. That’s going to make your array even bigger, and all those padding tokens will slow down training too! + +### Loading data as a tf.data.Dataset + +If you want to avoid slowing down training, you can load your data as a `tf.data.Dataset` instead. Although you can write your own +`tf.data` pipeline if you want, we have two convenience methods for doing this: + +- [`~TFPreTrainedModel.prepare_tf_dataset`]: This is the method we recommend in most cases. Because it is a method +on your model, it can inspect the model to automatically figure out which columns are usable as model inputs, and +discard the others to make a simpler, more performant dataset. +- [`~datasets.Dataset.to_tf_dataset`]: This method is more low-level, and is useful when you want to exactly control how +your dataset is created, by specifying exactly which `columns` and `label_cols` to include. + +Before you can use [`~TFPreTrainedModel.prepare_tf_dataset`], you will need to add the tokenizer outputs to your dataset as columns, as shown in +the following code sample: + +```py +def tokenize_dataset(data): + # Keys of the returned dictionary will be added to the dataset as columns + return tokenizer(data["text"]) + + +dataset = dataset.map(tokenize_dataset) +``` + +Remember that Hugging Face datasets are stored on disk by default, so this will not inflate your memory usage! Once the +columns have been added, you can stream batches from the dataset and add padding to each batch, which greatly +reduces the number of padding tokens compared to padding the entire dataset. 
+ + +```py +>>> tf_dataset = model.prepare_tf_dataset(dataset["train"], batch_size=16, shuffle=True, tokenizer=tokenizer) +``` + +Note that in the code sample above, you need to pass the tokenizer to `prepare_tf_dataset` so it can correctly pad batches as they're loaded. +If all the samples in your dataset are the same length and no padding is necessary, you can skip this argument. +If you need to do something more complex than just padding samples (e.g. corrupting tokens for masked language +modelling), you can use the `collate_fn` argument instead to pass a function that will be called to transform the +list of samples into a batch and apply any preprocessing you want. See our +[examples](https://github.com/huggingface/transformers/tree/main/examples) or +[notebooks](https://huggingface.co/docs/transformers/notebooks) to see this approach in action. + +Once you've created a `tf.data.Dataset`, you can compile and fit the model as before: + +```py +model.compile(optimizer=Adam(3e-5)) # No loss argument! + +model.fit(tf_dataset) +``` + + + + + + +## Train in native PyTorch + + + + + +[`Trainer`] takes care of the training loop and allows you to fine-tune a model in a single line of code. For users who prefer to write their own training loop, you can also fine-tune a 🤗 Transformers model in native PyTorch. + +At this point, you may need to restart your notebook or execute the following code to free some memory: + +```py +del model +del trainer +torch.cuda.empty_cache() +``` + +Next, manually postprocess `tokenized_dataset` to prepare it for training. + +1. Remove the `text` column because the model does not accept raw text as an input: + + ```py + >>> tokenized_datasets = tokenized_datasets.remove_columns(["text"]) + ``` + +2. Rename the `label` column to `labels` because the model expects the argument to be named `labels`: + + ```py + >>> tokenized_datasets = tokenized_datasets.rename_column("label", "labels") + ``` + +3. Set the format of the dataset to return PyTorch tensors instead of lists: + + ```py + >>> tokenized_datasets.set_format("torch") + ``` + +Then create a smaller subset of the dataset as previously shown to speed up the fine-tuning: + +```py +>>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) +>>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) +``` + +### DataLoader + +Create a `DataLoader` for your training and test datasets so you can iterate over batches of data: + +```py +>>> from torch.utils.data import DataLoader + +>>> train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8) +>>> eval_dataloader = DataLoader(small_eval_dataset, batch_size=8) +``` + +Load your model with the number of expected labels: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) +``` + +### Optimizer and learning rate scheduler + +Create an optimizer and learning rate scheduler to fine-tune the model. Let's use the [`AdamW`](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) optimizer from PyTorch: + +```py +>>> from torch.optim import AdamW + +>>> optimizer = AdamW(model.parameters(), lr=5e-5) +``` + +Create the default learning rate scheduler from [`Trainer`]: + +```py +>>> from transformers import get_scheduler + +>>> num_epochs = 3 +>>> num_training_steps = num_epochs * len(train_dataloader) +>>> lr_scheduler = get_scheduler( +... 
name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps +... ) +``` + +Lastly, specify `device` to use a GPU if you have access to one. Otherwise, training on a CPU may take several hours instead of a couple of minutes. + +```py +>>> import torch + +>>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +>>> model.to(device) +``` + + + +Get free access to a cloud GPU if you don't have one with a hosted notebook like [Colaboratory](https://colab.research.google.com/) or [SageMaker StudioLab](https://studiolab.sagemaker.aws/). + + + +Great, now you are ready to train! 🥳 + +### Training loop + +To keep track of your training progress, use the [tqdm](https://tqdm.github.io/) library to add a progress bar over the number of training steps: + +```py +>>> from tqdm.auto import tqdm + +>>> progress_bar = tqdm(range(num_training_steps)) + +>>> model.train() +>>> for epoch in range(num_epochs): +... for batch in train_dataloader: +... batch = {k: v.to(device) for k, v in batch.items()} +... outputs = model(**batch) +... loss = outputs.loss +... loss.backward() + +... optimizer.step() +... lr_scheduler.step() +... optimizer.zero_grad() +... progress_bar.update(1) +``` + +### Evaluate + +Just like how you added an evaluation function to [`Trainer`], you need to do the same when you write your own training loop. But instead of calculating and reporting the metric at the end of each epoch, this time you'll accumulate all the batches with [`~evaluate.add_batch`] and calculate the metric at the very end. + +```py +>>> import evaluate + +>>> metric = evaluate.load("accuracy") +>>> model.eval() +>>> for batch in eval_dataloader: +... batch = {k: v.to(device) for k, v in batch.items()} +... with torch.no_grad(): +... outputs = model(**batch) + +... logits = outputs.logits +... predictions = torch.argmax(logits, dim=-1) +... metric.add_batch(predictions=predictions, references=batch["labels"]) + +>>> metric.compute() +``` + + + + + +## Additional resources + +For more fine-tuning examples, refer to: + +- [🤗 Transformers Examples](https://github.com/huggingface/transformers/tree/main/examples) includes scripts + to train common NLP tasks in PyTorch and TensorFlow. + +- [🤗 Transformers Notebooks](notebooks) contains various notebooks on how to fine-tune a model for specific tasks in PyTorch and TensorFlow. diff --git a/docs/source/en/training.mdx b/docs/source/en/training.mdx deleted file mode 100644 index 336ce05b83c0..000000000000 --- a/docs/source/en/training.mdx +++ /dev/null @@ -1,430 +0,0 @@ - - -# Fine-tune a pretrained model - -[[open-in-colab]] - -There are significant benefits to using a pretrained model. It reduces computation costs, your carbon footprint, and allows you to use state-of-the-art models without having to train one from scratch. 🤗 Transformers provides access to thousands of pretrained models for a wide range of tasks. When you use a pretrained model, you train it on a dataset specific to your task. This is known as fine-tuning, an incredibly powerful training technique. In this tutorial, you will fine-tune a pretrained model with a deep learning framework of your choice: - -* Fine-tune a pretrained model with 🤗 Transformers [`Trainer`]. -* Fine-tune a pretrained model in TensorFlow with Keras. -* Fine-tune a pretrained model in native PyTorch. - - - -## Prepare a dataset - - - -Before you can fine-tune a pretrained model, download a dataset and prepare it for training. 
The previous tutorial showed you how to process data for training, and now you get an opportunity to put those skills to the test! - -Begin by loading the [Yelp Reviews](https://huggingface.co/datasets/yelp_review_full) dataset: - -```py ->>> from datasets import load_dataset - ->>> dataset = load_dataset("yelp_review_full") ->>> dataset["train"][100] -{'label': 0, - 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'} -``` - -As you now know, you need a tokenizer to process the text and include a padding and truncation strategy to handle any variable sequence lengths. To process your dataset in one step, use 🤗 Datasets [`map`](https://huggingface.co/docs/datasets/process.html#map) method to apply a preprocessing function over the entire dataset: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") - - ->>> def tokenize_function(examples): -... return tokenizer(examples["text"], padding="max_length", truncation=True) - - ->>> tokenized_datasets = dataset.map(tokenize_function, batched=True) -``` - -If you like, you can create a smaller subset of the full dataset to fine-tune on to reduce the time it takes: - -```py ->>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) ->>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) -``` - - - -## Train - -At this point, you should follow the section corresponding to the framework you want to use. You can use the links -in the right sidebar to jump to the one you want - and if you want to hide all of the content for a given framework, -just use the button at the top-right of that framework's block! - - - - - -## Train with PyTorch Trainer - -🤗 Transformers provides a [`Trainer`] class optimized for training 🤗 Transformers models, making it easier to start training without manually writing your own training loop. The [`Trainer`] API supports a wide range of training options and features such as logging, gradient accumulation, and mixed precision. - -Start by loading your model and specify the number of expected labels. 
From the Yelp Review [dataset card](https://huggingface.co/datasets/yelp_review_full#data-fields), you know there are five labels: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) -``` - - - -You will see a warning about some of the pretrained weights not being used and some weights being randomly -initialized. Don't worry, this is completely normal! The pretrained head of the BERT model is discarded, and replaced with a randomly initialized classification head. You will fine-tune this new model head on your sequence classification task, transferring the knowledge of the pretrained model to it. - - - -### Training hyperparameters - -Next, create a [`TrainingArguments`] class which contains all the hyperparameters you can tune as well as flags for activating different training options. For this tutorial you can start with the default training [hyperparameters](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments), but feel free to experiment with these to find your optimal settings. - -Specify where to save the checkpoints from your training: - -```py ->>> from transformers import TrainingArguments - ->>> training_args = TrainingArguments(output_dir="test_trainer") -``` - -### Evaluate - -[`Trainer`] does not automatically evaluate model performance during training. You'll need to pass [`Trainer`] a function to compute and report metrics. The [🤗 Evaluate](https://huggingface.co/docs/evaluate/index) library provides a simple [`accuracy`](https://huggingface.co/spaces/evaluate-metric/accuracy) function you can load with the [`evaluate.load`] (see this [quicktour](https://huggingface.co/docs/evaluate/a_quick_tour) for more information) function: - -```py ->>> import numpy as np ->>> import evaluate - ->>> metric = evaluate.load("accuracy") -``` - -Call [`~evaluate.compute`] on `metric` to calculate the accuracy of your predictions. Before passing your predictions to `compute`, you need to convert the predictions to logits (remember all 🤗 Transformers models return logits): - -```py ->>> def compute_metrics(eval_pred): -... logits, labels = eval_pred -... predictions = np.argmax(logits, axis=-1) -... return metric.compute(predictions=predictions, references=labels) -``` - -If you'd like to monitor your evaluation metrics during fine-tuning, specify the `evaluation_strategy` parameter in your training arguments to report the evaluation metric at the end of each epoch: - -```py ->>> from transformers import TrainingArguments, Trainer - ->>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") -``` - -### Trainer - -Create a [`Trainer`] object with your model, training arguments, training and test datasets, and evaluation function: - -```py ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=small_train_dataset, -... eval_dataset=small_eval_dataset, -... compute_metrics=compute_metrics, -... ) -``` - -Then fine-tune your model by calling [`~transformers.Trainer.train`]: - -```py ->>> trainer.train() -``` - - - - - - -## Train a TensorFlow model with Keras - -You can also train 🤗 Transformers models in TensorFlow with the Keras API! - -### Loading data for Keras - -When you want to train a 🤗 Transformers model with the Keras API, you need to convert your dataset to a format that -Keras understands. 
If your dataset is small, you can just convert the whole thing to NumPy arrays and pass it to Keras. -Let's try that first before we do anything more complicated. - -First, load a dataset. We'll use the CoLA dataset from the [GLUE benchmark](https://huggingface.co/datasets/glue), -since it's a simple binary text classification task, and just take the training split for now. - -```py -from datasets import load_dataset - -dataset = load_dataset("glue", "cola") -dataset = dataset["train"] # Just take the training split for now -``` - -Next, load a tokenizer and tokenize the data as NumPy arrays. Note that the labels are already a list of 0 and 1s, -so we can just convert that directly to a NumPy array without tokenization! - -```py -from transformers import AutoTokenizer - -tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") -tokenized_data = tokenizer(dataset["text"], return_tensors="np", padding=True) -# Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras -tokenized_data = dict(tokenized_data) - -labels = np.array(dataset["label"]) # Label is already an array of 0 and 1 -``` - -Finally, load, [`compile`](https://keras.io/api/models/model_training_apis/#compile-method), and [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) the model: - -```py -from transformers import TFAutoModelForSequenceClassification -from tensorflow.keras.optimizers import Adam - -# Load and compile our model -model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased") -# Lower learning rates are often better for fine-tuning transformers -model.compile(optimizer=Adam(3e-5)) - -model.fit(tokenized_data, labels) -``` - - - -You don't have to pass a loss argument to your models when you `compile()` them! Hugging Face models automatically -choose a loss that is appropriate for their task and model architecture if this argument is left blank. You can always -override this by specifying a loss yourself if you want to! - - - -This approach works great for smaller datasets, but for larger datasets, you might find it starts to become a problem. Why? -Because the tokenized array and labels would have to be fully loaded into memory, and because NumPy doesn’t handle -“jagged” arrays, so every tokenized sample would have to be padded to the length of the longest sample in the whole -dataset. That’s going to make your array even bigger, and all those padding tokens will slow down training too! - -### Loading data as a tf.data.Dataset - -If you want to avoid slowing down training, you can load your data as a `tf.data.Dataset` instead. Although you can write your own -`tf.data` pipeline if you want, we have two convenience methods for doing this: - -- [`~TFPreTrainedModel.prepare_tf_dataset`]: This is the method we recommend in most cases. Because it is a method -on your model, it can inspect the model to automatically figure out which columns are usable as model inputs, and -discard the others to make a simpler, more performant dataset. -- [`~datasets.Dataset.to_tf_dataset`]: This method is more low-level, and is useful when you want to exactly control how -your dataset is created, by specifying exactly which `columns` and `label_cols` to include. 
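For the lower-level route, a [`~datasets.Dataset.to_tf_dataset`] call might look like the sketch below; the column names and the data collator are assumptions for a typical tokenized text-classification dataset rather than something this tutorial sets up:

```py
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

tf_dataset = tokenized_dataset.to_tf_dataset(
    columns=["input_ids", "token_type_ids", "attention_mask"],
    label_cols=["label"],
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
)
```

In most cases, though, [`~TFPreTrainedModel.prepare_tf_dataset`] described below is the simpler option.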
- -Before you can use [`~TFPreTrainedModel.prepare_tf_dataset`], you will need to add the tokenizer outputs to your dataset as columns, as shown in -the following code sample: - -```py -def tokenize_dataset(data): - # Keys of the returned dictionary will be added to the dataset as columns - return tokenizer(data["text"]) - - -dataset = dataset.map(tokenize_dataset) -``` - -Remember that Hugging Face datasets are stored on disk by default, so this will not inflate your memory usage! Once the -columns have been added, you can stream batches from the dataset and add padding to each batch, which greatly -reduces the number of padding tokens compared to padding the entire dataset. - - -```py ->>> tf_dataset = model.prepare_tf_dataset(dataset, batch_size=16, shuffle=True, tokenizer=tokenizer) -``` - -Note that in the code sample above, you need to pass the tokenizer to `prepare_tf_dataset` so it can correctly pad batches as they're loaded. -If all the samples in your dataset are the same length and no padding is necessary, you can skip this argument. -If you need to do something more complex than just padding samples (e.g. corrupting tokens for masked language -modelling), you can use the `collate_fn` argument instead to pass a function that will be called to transform the -list of samples into a batch and apply any preprocessing you want. See our -[examples](https://github.com/huggingface/transformers/tree/main/examples) or -[notebooks](https://huggingface.co/docs/transformers/notebooks) to see this approach in action. - -Once you've created a `tf.data.Dataset`, you can compile and fit the model as before: - -```py -model.compile(optimizer=Adam(3e-5)) - -model.fit(tf_dataset) -``` - - - - - - -## Train in native PyTorch - - - - - -[`Trainer`] takes care of the training loop and allows you to fine-tune a model in a single line of code. For users who prefer to write their own training loop, you can also fine-tune a 🤗 Transformers model in native PyTorch. - -At this point, you may need to restart your notebook or execute the following code to free some memory: - -```py -del model -del trainer -torch.cuda.empty_cache() -``` - -Next, manually postprocess `tokenized_dataset` to prepare it for training. - -1. Remove the `text` column because the model does not accept raw text as an input: - - ```py - >>> tokenized_datasets = tokenized_datasets.remove_columns(["text"]) - ``` - -2. Rename the `label` column to `labels` because the model expects the argument to be named `labels`: - - ```py - >>> tokenized_datasets = tokenized_datasets.rename_column("label", "labels") - ``` - -3. 
Set the format of the dataset to return PyTorch tensors instead of lists: - - ```py - >>> tokenized_datasets.set_format("torch") - ``` - -Then create a smaller subset of the dataset as previously shown to speed up the fine-tuning: - -```py ->>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) ->>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) -``` - -### DataLoader - -Create a `DataLoader` for your training and test datasets so you can iterate over batches of data: - -```py ->>> from torch.utils.data import DataLoader - ->>> train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8) ->>> eval_dataloader = DataLoader(small_eval_dataset, batch_size=8) -``` - -Load your model with the number of expected labels: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) -``` - -### Optimizer and learning rate scheduler - -Create an optimizer and learning rate scheduler to fine-tune the model. Let's use the [`AdamW`](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) optimizer from PyTorch: - -```py ->>> from torch.optim import AdamW - ->>> optimizer = AdamW(model.parameters(), lr=5e-5) -``` - -Create the default learning rate scheduler from [`Trainer`]: - -```py ->>> from transformers import get_scheduler - ->>> num_epochs = 3 ->>> num_training_steps = num_epochs * len(train_dataloader) ->>> lr_scheduler = get_scheduler( -... name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps -... ) -``` - -Lastly, specify `device` to use a GPU if you have access to one. Otherwise, training on a CPU may take several hours instead of a couple of minutes. - -```py ->>> import torch - ->>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") ->>> model.to(device) -``` - - - -Get free access to a cloud GPU if you don't have one with a hosted notebook like [Colaboratory](https://colab.research.google.com/) or [SageMaker StudioLab](https://studiolab.sagemaker.aws/). - - - -Great, now you are ready to train! 🥳 - -### Training loop - -To keep track of your training progress, use the [tqdm](https://tqdm.github.io/) library to add a progress bar over the number of training steps: - -```py ->>> from tqdm.auto import tqdm - ->>> progress_bar = tqdm(range(num_training_steps)) - ->>> model.train() ->>> for epoch in range(num_epochs): -... for batch in train_dataloader: -... batch = {k: v.to(device) for k, v in batch.items()} -... outputs = model(**batch) -... loss = outputs.loss -... loss.backward() - -... optimizer.step() -... lr_scheduler.step() -... optimizer.zero_grad() -... progress_bar.update(1) -``` - -### Evaluate - -Just like how you added an evaluation function to [`Trainer`], you need to do the same when you write your own training loop. But instead of calculating and reporting the metric at the end of each epoch, this time you'll accumulate all the batches with [`~evaluate.add_batch`] and calculate the metric at the very end. - -```py ->>> import evaluate - ->>> metric = evaluate.load("accuracy") ->>> model.eval() ->>> for batch in eval_dataloader: -... batch = {k: v.to(device) for k, v in batch.items()} -... with torch.no_grad(): -... outputs = model(**batch) - -... logits = outputs.logits -... predictions = torch.argmax(logits, dim=-1) -... 
metric.add_batch(predictions=predictions, references=batch["labels"]) - ->>> metric.compute() -``` - - - - - -## Additional resources - -For more fine-tuning examples, refer to: - -- [🤗 Transformers Examples](https://github.com/huggingface/transformers/tree/main/examples) includes scripts - to train common NLP tasks in PyTorch and TensorFlow. - -- [🤗 Transformers Notebooks](notebooks) contains various notebooks on how to fine-tune a model for specific tasks in PyTorch and TensorFlow. diff --git a/docs/source/en/transformers_agents.md b/docs/source/en/transformers_agents.md new file mode 100644 index 000000000000..424f5b15f5dd --- /dev/null +++ b/docs/source/en/transformers_agents.md @@ -0,0 +1,323 @@ + + +# Transformers Agents + + + +Transformers Agents is an experimental API which is subject to change at any time. Results returned by the agents +can vary as the APIs or underlying models are prone to change. + + + +Transformers version v4.29.0, building on the concept of *tools* and *agents*. You can play with in +[this colab](https://colab.research.google.com/drive/1c7MHD-T1forUPGcC_jlwsIptOzpG3hSj). + +In short, it provides a natural language API on top of transformers: we define a set of curated tools and design an +agent to interpret natural language and to use these tools. It is extensible by design; we curated some relevant tools, +but we'll show you how the system can be extended easily to use any tool developed by the community. + +Let's start with a few examples of what can be achieved with this new API. It is particularly powerful when it comes +to multimodal tasks, so let's take it for a spin to generate images and read text out loud. + +```py +agent.run("Caption the following image", image=image) +``` + +| **Input** | **Output** | +|-----------------------------------------------------------------------------------------------------------------------------|-----------------------------------| +| | A beaver is swimming in the water | + +--- + +```py +agent.run("Read the following text out loud", text=text) +``` +| **Input** | **Output** | +|-------------------------------------------------------------------------------------------------------------------------|----------------------------------------------| +| A beaver is swimming in the water | + +--- + +```py +agent.run( + "In the following `document`, where will the TRRF Scientific Advisory Council Meeting take place?", + document=document, +) +``` +| **Input** | **Output** | +|-----------------------------------------------------------------------------------------------------------------------------|----------------| +| | ballroom foyer | + +## Quickstart + +Before being able to use `agent.run`, you will need to instantiate an agent, which is a large language model (LLM). +We provide support for openAI models as well as opensource alternatives from BigCode and OpenAssistant. The openAI +models perform better (but require you to have an openAI API key, so cannot be used for free); Hugging Face is +providing free access to endpoints for BigCode and OpenAssistant models. + +To start with, please install the `agents` extras in order to install all default dependencies. 
+```bash +pip install transformers[agents] +``` + +To use openAI models, you instantiate an [`OpenAiAgent`] after installing the `openai` dependency: + +```bash +pip install openai +``` + + +```py +from transformers import OpenAiAgent + +agent = OpenAiAgent(model="text-davinci-003", api_key="") +``` + +To use BigCode or OpenAssistant, start by logging in to have access to the Inference API: + +```py +from huggingface_hub import login + +login("") +``` + +Then, instantiate the agent + +```py +from transformers import HfAgent + +# Starcoder +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder") +# StarcoderBase +# agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoderbase") +# OpenAssistant +# agent = HfAgent(url_endpoint="https://api-inference.huggingface.co/models/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5") +``` + +This is using the inference API that Hugging Face provides for free at the moment. If you have your own inference +endpoint for this model (or another one) you can replace the URL above with your URL endpoint. + + + +StarCoder and OpenAssistant are free to use and perform admirably well on simple tasks. However, the checkpoints +don't hold up when handling more complex prompts. If you're facing such an issue, we recommend trying out the OpenAI +model which, while sadly not open-source, performs better at this given time. + + + +You're now good to go! Let's dive into the two APIs that you now have at your disposal. + +### Single execution (run) + +The single execution method is when using the [`~Agent.run`] method of the agent: + +```py +agent.run("Draw me a picture of rivers and lakes.") +``` + + + +It automatically selects the tool (or tools) appropriate for the task you want to perform and runs them appropriately. It +can perform one or several tasks in the same instruction (though the more complex your instruction, the more likely +the agent is to fail). + +```py +agent.run("Draw me a picture of the sea then transform the picture to add an island") +``` + + + +
+ + +Every [`~Agent.run`] operation is independent, so you can run it several times in a row with different tasks. + +Note that your `agent` is just a large-language model, so small variations in your prompt might yield completely +different results. It's important to explain as clearly as possible the task you want to perform. We go more in-depth +on how to write good prompts [here](custom_tools#writing-good-user-inputs). + +If you'd like to keep a state across executions or to pass non-text objects to the agent, you can do so by specifying +variables that you would like the agent to use. For example, you could generate the first image of rivers and lakes, +and ask the model to update that picture to add an island by doing the following: + +```python +picture = agent.run("Generate a picture of rivers and lakes.") +updated_picture = agent.run("Transform the image in `picture` to add an island to it.", picture=picture) +``` + + + +This can be helpful when the model is unable to understand your request and mixes tools. An example would be: + +```py +agent.run("Draw me the picture of a capybara swimming in the sea") +``` + +Here, the model could interpret in two ways: +- Have the `text-to-image` generate a capybara swimming in the sea +- Or, have the `text-to-image` generate capybara, then use the `image-transformation` tool to have it swim in the sea + +In case you would like to force the first scenario, you could do so by passing it the prompt as an argument: + +```py +agent.run("Draw me a picture of the `prompt`", prompt="a capybara swimming in the sea") +``` + + + + +### Chat-based execution (chat) + +The agent also has a chat-based approach, using the [`~Agent.chat`] method: + +```py +agent.chat("Generate a picture of rivers and lakes") +``` + + + +```py +agent.chat("Transform the picture so that there is a rock in there") +``` + + + +
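+As with [`~Agent.run`], you can hand non-text objects to `chat` by passing them as keyword arguments. A minimal
+sketch (the `picture` variable is just illustrative, reusing an image generated earlier):
+
+```py
+picture = agent.run("Generate a picture of rivers and lakes.")
+agent.chat("Transform the image in `picture` so that there is a rock in there", picture=picture)
+```
+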
+
+This is an interesting approach when you want to keep the state across instructions. It is better suited for
+experimentation, but it tends to handle single instructions much better than complex, multi-step instructions
+(which the [`~Agent.run`] method is better at handling).
+
+The `chat` method can also take arguments if you would like to pass non-text types or specific prompts, as sketched
+above.
+
+### ⚠️ Remote execution
+
+For demonstration purposes, and so that it could be used with all setups, we created remote executors for several
+of the default tools the agent has access to for the release. These are created using
+[inference endpoints](https://huggingface.co/inference-endpoints).
+
+We have turned these off for now, but to see how to set up remote executor tools yourself,
+we recommend reading the [custom tool guide](./custom_tools).
+
+### What's happening here? What are tools, and what are agents?
+
+#### Agents
+
+The "agent" here is a large language model, and we're prompting it so that it has access to a specific set of tools.
+
+LLMs are pretty good at generating small samples of code, so this API takes advantage of that by prompting the
+LLM to give a small sample of code that performs a task with a set of tools. This prompt is then completed by the
+task you give your agent and the description of the tools you give it. This way it gets access to the documentation
+of the tools you are using, especially their expected inputs and outputs, and can generate the relevant code.
+
+#### Tools
+
+Tools are very simple: each one is a single function, with a name and a description. We then use these tools'
+descriptions to prompt the agent. Through the prompt, we show the agent how it would leverage tools to perform what
+was requested in the query.
+
+This uses brand-new tools and not pipelines, because the agent writes better code with very atomic tools.
+Pipelines are more refactored and often combine several tasks into one. Tools are meant to be focused on
+one very simple task only.
+
+#### Code-execution?!
+
+This code is then executed with our small Python interpreter on the set of inputs passed along with your tools.
+We hear you screaming "Arbitrary code execution!" in the back, but let us explain why that is not the case.
+
+The only functions that can be called are the tools you provided and the print function, so you're already
+limited in what can be executed. You should be safe if it's limited to Hugging Face tools.
+
+Then, we don't allow any attribute lookup or imports (which shouldn't be needed anyway for passing along
+inputs/outputs to a small set of functions), so all the most obvious attacks (and you'd need to prompt the LLM
+to output them anyway) shouldn't be an issue. If you want to be on the super safe side, you can execute the
+`run()` method with the additional argument `return_code=True`, in which case the agent will just return the code
+to execute and you can decide whether to run it or not.
+
+The execution will stop at any line trying to perform an illegal operation, or if there is a regular Python error
+with the code generated by the agent.
+
+### A curated set of tools
+
+We identify a set of tools that can empower such agents.
Here is an updated list of the tools we have integrated +in `transformers`: + +- **Document question answering**: given a document (such as a PDF) in image format, answer a question on this document ([Donut](./model_doc/donut)) +- **Text question answering**: given a long text and a question, answer the question in the text ([Flan-T5](./model_doc/flan-t5)) +- **Unconditional image captioning**: Caption the image! ([BLIP](./model_doc/blip)) +- **Image question answering**: given an image, answer a question on this image ([VILT](./model_doc/vilt)) +- **Image segmentation**: given an image and a prompt, output the segmentation mask of that prompt ([CLIPSeg](./model_doc/clipseg)) +- **Speech to text**: given an audio recording of a person talking, transcribe the speech into text ([Whisper](./model_doc/whisper)) +- **Text to speech**: convert text to speech ([SpeechT5](./model_doc/speecht5)) +- **Zero-shot text classification**: given a text and a list of labels, identify to which label the text corresponds the most ([BART](./model_doc/bart)) +- **Text summarization**: summarize a long text in one or a few sentences ([BART](./model_doc/bart)) +- **Translation**: translate the text into a given language ([NLLB](./model_doc/nllb)) + +These tools have an integration in transformers, and can be used manually as well, for example: + +```py +from transformers import load_tool + +tool = load_tool("text-to-speech") +audio = tool("This is a text to speech tool") +``` + +### Custom tools + +While we identify a curated set of tools, we strongly believe that the main value provided by this implementation is +the ability to quickly create and share custom tools. + +By pushing the code of a tool to a Hugging Face Space or a model repository, you're then able to leverage the tool +directly with the agent. We've added a few +**transformers-agnostic** tools to the [`huggingface-tools` organization](https://huggingface.co/huggingface-tools): + +- **Text downloader**: to download a text from a web URL +- **Text to image**: generate an image according to a prompt, leveraging stable diffusion +- **Image transformation**: modify an image given an initial image and a prompt, leveraging instruct pix2pix stable diffusion +- **Text to video**: generate a small video according to a prompt, leveraging damo-vilab + +The text-to-image tool we have been using since the beginning is a remote tool that lives in +[*huggingface-tools/text-to-image*](https://huggingface.co/spaces/huggingface-tools/text-to-image)! We will +continue releasing such tools on this and other organizations, to further supercharge this implementation. + +The agents have by default access to tools that reside on [`huggingface-tools`](https://huggingface.co/huggingface-tools). +We explain how to you can write and share your tools as well as leverage any custom tool that resides on the Hub in [following guide](custom_tools). + +### Code generation + +So far we have shown how to use the agents to perform actions for you. However, the agent is only generating code +that we then execute using a very restricted Python interpreter. In case you would like to use the code generated in +a different setting, the agent can be prompted to return the code, along with tool definition and accurate imports. 
+ +For example, the following instruction +```python +agent.run("Draw me a picture of rivers and lakes", return_code=True) +``` + +returns the following code + +```python +from transformers import load_tool + +image_generator = load_tool("huggingface-tools/text-to-image") + +image = image_generator(prompt="rivers and lakes") +``` + +that you can then modify and execute yourself. diff --git a/docs/source/en/troubleshooting.md b/docs/source/en/troubleshooting.md new file mode 100644 index 000000000000..29b032dd2799 --- /dev/null +++ b/docs/source/en/troubleshooting.md @@ -0,0 +1,198 @@ + + +# Troubleshoot + +Sometimes errors occur, but we are here to help! This guide covers some of the most common issues we've seen and how you can resolve them. However, this guide isn't meant to be a comprehensive collection of every 🤗 Transformers issue. For more help with troubleshooting your issue, try: + + + +1. Asking for help on the [forums](https://discuss.huggingface.co/). There are specific categories you can post your question to, like [Beginners](https://discuss.huggingface.co/c/beginners/5) or [🤗 Transformers](https://discuss.huggingface.co/c/transformers/9). Make sure you write a good descriptive forum post with some reproducible code to maximize the likelihood that your problem is solved! + + + +2. Create an [Issue](https://github.com/huggingface/transformers/issues/new/choose) on the 🤗 Transformers repository if it is a bug related to the library. Try to include as much information describing the bug as possible to help us better figure out what's wrong and how we can fix it. + +3. Check the [Migration](migration) guide if you use an older version of 🤗 Transformers since some important changes have been introduced between versions. + +For more details about troubleshooting and getting help, take a look at [Chapter 8](https://huggingface.co/course/chapter8/1?fw=pt) of the Hugging Face course. + + +## Firewalled environments + +Some GPU instances on cloud and intranet setups are firewalled to external connections, resulting in a connection error. When your script attempts to download model weights or datasets, the download will hang and then timeout with the following message: + +``` +ValueError: Connection error, and we cannot find the requested files in the cached path. +Please try again or make sure your Internet connection is on. +``` + +In this case, you should try to run 🤗 Transformers on [offline mode](installation#offline-mode) to avoid the connection error. + +## CUDA out of memory + +Training large models with millions of parameters can be challenging without the appropriate hardware. A common error you may encounter when the GPU runs out of memory is: + +``` +CUDA out of memory. Tried to allocate 256.00 MiB (GPU 0; 11.17 GiB total capacity; 9.70 GiB already allocated; 179.81 MiB free; 9.85 GiB reserved in total by PyTorch) +``` + +Here are some potential solutions you can try to lessen memory use: + +- Reduce the [`per_device_train_batch_size`](main_classes/trainer#transformers.TrainingArguments.per_device_train_batch_size) value in [`TrainingArguments`]. +- Try using [`gradient_accumulation_steps`](main_classes/trainer#transformers.TrainingArguments.gradient_accumulation_steps) in [`TrainingArguments`] to effectively increase overall batch size. + + + +Refer to the Performance [guide](performance) for more details about memory-saving techniques. 
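+As an illustration, both suggestions above map directly to [`TrainingArguments`] parameters. A minimal sketch
+(the values shown are placeholders to illustrate the trade-off, not recommendations):
+
+```py
+>>> from transformers import TrainingArguments
+
+>>> # Lowering the per-device batch size while raising gradient accumulation keeps the effective
+>>> # batch size (per_device_train_batch_size * gradient_accumulation_steps) roughly constant
+>>> # while reducing peak GPU memory usage.
+>>> training_args = TrainingArguments(
+...     output_dir="test_trainer",
+...     per_device_train_batch_size=4,  # placeholder: lower until the OOM error disappears
+...     gradient_accumulation_steps=4,  # placeholder: raise to preserve the effective batch size
+... )
+```
+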
+ + + +## Unable to load a saved TensorFlow model + +TensorFlow's [model.save](https://www.tensorflow.org/tutorials/keras/save_and_load#save_the_entire_model) method will save the entire model - architecture, weights, training configuration - in a single file. However, when you load the model file again, you may run into an error because 🤗 Transformers may not load all the TensorFlow-related objects in the model file. To avoid issues with saving and loading TensorFlow models, we recommend you: + +- Save the model weights as a `h5` file extension with [`model.save_weights`](https://www.tensorflow.org/tutorials/keras/save_and_load#save_the_entire_model) and then reload the model with [`~TFPreTrainedModel.from_pretrained`]: + +```py +>>> from transformers import TFPreTrainedModel +>>> from tensorflow import keras + +>>> model.save_weights("some_folder/tf_model.h5") +>>> model = TFPreTrainedModel.from_pretrained("some_folder") +``` + +- Save the model with [`~TFPretrainedModel.save_pretrained`] and load it again with [`~TFPreTrainedModel.from_pretrained`]: + +```py +>>> from transformers import TFPreTrainedModel + +>>> model.save_pretrained("path_to/model") +>>> model = TFPreTrainedModel.from_pretrained("path_to/model") +``` + +## ImportError + +Another common error you may encounter, especially if it is a newly released model, is `ImportError`: + +``` +ImportError: cannot import name 'ImageGPTImageProcessor' from 'transformers' (unknown location) +``` + +For these error types, check to make sure you have the latest version of 🤗 Transformers installed to access the most recent models: + +```bash +pip install transformers --upgrade +``` + +## CUDA error: device-side assert triggered + +Sometimes you may run into a generic CUDA error about an error in the device code. + +``` +RuntimeError: CUDA error: device-side assert triggered +``` + +You should try to run the code on a CPU first to get a more descriptive error message. Add the following environment variable to the beginning of your code to switch to a CPU: + +```py +>>> import os + +>>> os.environ["CUDA_VISIBLE_DEVICES"] = "" +``` + +Another option is to get a better traceback from the GPU. Add the following environment variable to the beginning of your code to get the traceback to point to the source of the error: + +```py +>>> import os + +>>> os.environ["CUDA_LAUNCH_BLOCKING"] = "1" +``` + +## Incorrect output when padding tokens aren't masked + +In some cases, the output `hidden_state` may be incorrect if the `input_ids` include padding tokens. To demonstrate, load a model and tokenizer. You can access a model's `pad_token_id` to see its value. The `pad_token_id` may be `None` for some models, but you can always manually set it. 
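+For example, a common way to set it manually is to reuse an existing special token; this is a sketch that assumes
+your tokenizer defines an EOS token:
+
+```py
+>>> tokenizer.pad_token = tokenizer.eos_token  # assumes the tokenizer has an EOS token
+>>> model.config.pad_token_id = tokenizer.pad_token_id
+```
+
+The `bert-base-uncased` checkpoint used below already comes with `pad_token_id = 0`, so no manual step is needed here:
+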
+ +```py +>>> from transformers import AutoModelForSequenceClassification +>>> import torch + +>>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased") +>>> model.config.pad_token_id +0 +``` + +The following example shows the output without masking the padding tokens: + +```py +>>> input_ids = torch.tensor([[7592, 2057, 2097, 2393, 9611, 2115], [7592, 0, 0, 0, 0, 0]]) +>>> output = model(input_ids) +>>> print(output.logits) +tensor([[ 0.0082, -0.2307], + [ 0.1317, -0.1683]], grad_fn=) +``` + +Here is the actual output of the second sequence: + +```py +>>> input_ids = torch.tensor([[7592]]) +>>> output = model(input_ids) +>>> print(output.logits) +tensor([[-0.1008, -0.4061]], grad_fn=) +``` + +Most of the time, you should provide an `attention_mask` to your model to ignore the padding tokens to avoid this silent error. Now the output of the second sequence matches its actual output: + + + +By default, the tokenizer creates an `attention_mask` for you based on your specific tokenizer's defaults. + + + +```py +>>> attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1], [1, 0, 0, 0, 0, 0]]) +>>> output = model(input_ids, attention_mask=attention_mask) +>>> print(output.logits) +tensor([[ 0.0082, -0.2307], + [-0.1008, -0.4061]], grad_fn=) +``` + +🤗 Transformers doesn't automatically create an `attention_mask` to mask a padding token if it is provided because: + +- Some models don't have a padding token. +- For some use-cases, users want a model to attend to a padding token. + +## ValueError: Unrecognized configuration class XYZ for this kind of AutoModel + +Generally, we recommend using the [`AutoModel`] class to load pretrained instances of models. This class +can automatically infer and load the correct architecture from a given checkpoint based on the configuration. If you see +this `ValueError` when loading a model from a checkpoint, this means the Auto class couldn't find a mapping from +the configuration in the given checkpoint to the kind of model you are trying to load. Most commonly, this happens when a +checkpoint doesn't support a given task. +For instance, you'll see this error in the following example because there is no GPT2 for question answering: + +```py +>>> from transformers import AutoProcessor, AutoModelForQuestionAnswering + +>>> processor = AutoProcessor.from_pretrained("gpt2-medium") +>>> model = AutoModelForQuestionAnswering.from_pretrained("gpt2-medium") +ValueError: Unrecognized configuration class for this kind of AutoModel: AutoModelForQuestionAnswering. +Model type should be one of AlbertConfig, BartConfig, BertConfig, BigBirdConfig, BigBirdPegasusConfig, BloomConfig, ... +``` diff --git a/docs/source/en/troubleshooting.mdx b/docs/source/en/troubleshooting.mdx deleted file mode 100644 index 74346bccef97..000000000000 --- a/docs/source/en/troubleshooting.mdx +++ /dev/null @@ -1,176 +0,0 @@ - - -# Troubleshoot - -Sometimes errors occur, but we are here to help! This guide covers some of the most common issues we've seen and how you can resolve them. However, this guide isn't meant to be a comprehensive collection of every 🤗 Transformers issue. For more help with troubleshooting your issue, try: - - - -1. Asking for help on the [forums](https://discuss.huggingface.co/). There are specific categories you can post your question to, like [Beginners](https://discuss.huggingface.co/c/beginners/5) or [🤗 Transformers](https://discuss.huggingface.co/c/transformers/9). 
Make sure you write a good descriptive forum post with some reproducible code to maximize the likelihood that your problem is solved! - - - -2. Create an [Issue](https://github.com/huggingface/transformers/issues/new/choose) on the 🤗 Transformers repository if it is a bug related to the library. Try to include as much information describing the bug as possible to help us better figure out what's wrong and how we can fix it. - -3. Check the [Migration](migration) guide if you use an older version of 🤗 Transformers since some important changes have been introduced between versions. - -For more details about troubleshooting and getting help, take a look at [Chapter 8](https://huggingface.co/course/chapter8/1?fw=pt) of the Hugging Face course. - - -## Firewalled environments - -Some GPU instances on cloud and intranet setups are firewalled to external connections, resulting in a connection error. When your script attempts to download model weights or datasets, the download will hang and then timeout with the following message: - -``` -ValueError: Connection error, and we cannot find the requested files in the cached path. -Please try again or make sure your Internet connection is on. -``` - -In this case, you should try to run 🤗 Transformers on [offline mode](installation#offline-mode) to avoid the connection error. - -## CUDA out of memory - -Training large models with millions of parameters can be challenging without the appropriate hardware. A common error you may encounter when the GPU runs out of memory is: - -``` -CUDA out of memory. Tried to allocate 256.00 MiB (GPU 0; 11.17 GiB total capacity; 9.70 GiB already allocated; 179.81 MiB free; 9.85 GiB reserved in total by PyTorch) -``` - -Here are some potential solutions you can try to lessen memory use: - -- Reduce the [`per_device_train_batch_size`](main_classes/trainer#transformers.TrainingArguments.per_device_train_batch_size) value in [`TrainingArguments`]. -- Try using [`gradient_accumulation_steps`](main_classes/trainer#transformers.TrainingArguments.gradient_accumulation_steps) in [`TrainingArguments`] to effectively increase overall batch size. - - - -Refer to the Performance [guide](performance) for more details about memory-saving techniques. - - - -## Unable to load a saved TensorFlow model - -TensorFlow's [model.save](https://www.tensorflow.org/tutorials/keras/save_and_load#save_the_entire_model) method will save the entire model - architecture, weights, training configuration - in a single file. However, when you load the model file again, you may run into an error because 🤗 Transformers may not load all the TensorFlow-related objects in the model file. 
To avoid issues with saving and loading TensorFlow models, we recommend you: - -- Save the model weights as a `h5` file extension with [`model.save_weights`](https://www.tensorflow.org/tutorials/keras/save_and_load#save_the_entire_model) and then reload the model with [`~TFPreTrainedModel.from_pretrained`]: - -```py ->>> from transformers import TFPreTrainedModel ->>> from tensorflow import keras - ->>> model.save_weights("some_folder/tf_model.h5") ->>> model = TFPreTrainedModel.from_pretrained("some_folder") -``` - -- Save the model with [`~TFPretrainedModel.save_pretrained`] and load it again with [`~TFPreTrainedModel.from_pretrained`]: - -```py ->>> from transformers import TFPreTrainedModel - ->>> model.save_pretrained("path_to/model") ->>> model = TFPreTrainedModel.from_pretrained("path_to/model") -``` - -## ImportError - -Another common error you may encounter, especially if it is a newly released model, is `ImportError`: - -``` -ImportError: cannot import name 'ImageGPTImageProcessor' from 'transformers' (unknown location) -``` - -For these error types, check to make sure you have the latest version of 🤗 Transformers installed to access the most recent models: - -```bash -pip install transformers --upgrade -``` - -## CUDA error: device-side assert triggered - -Sometimes you may run into a generic CUDA error about an error in the device code. - -``` -RuntimeError: CUDA error: device-side assert triggered -``` - -You should try to run the code on a CPU first to get a more descriptive error message. Add the following environment variable to the beginning of your code to switch to a CPU: - -```py ->>> import os - ->>> os.environ["CUDA_VISIBLE_DEVICES"] = "" -``` - -Another option is to get a better traceback from the GPU. Add the following environment variable to the beginning of your code to get the traceback to point to the source of the error: - -```py ->>> import os - ->>> os.environ["CUDA_LAUNCH_BLOCKING"] = "1" -``` - -## Incorrect output when padding tokens aren't masked - -In some cases, the output `hidden_state` may be incorrect if the `input_ids` include padding tokens. To demonstrate, load a model and tokenizer. You can access a model's `pad_token_id` to see its value. The `pad_token_id` may be `None` for some models, but you can always manually set it. - -```py ->>> from transformers import AutoModelForSequenceClassification ->>> import torch - ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased") ->>> model.config.pad_token_id -0 -``` - -The following example shows the output without masking the padding tokens: - -```py ->>> input_ids = torch.tensor([[7592, 2057, 2097, 2393, 9611, 2115], [7592, 0, 0, 0, 0, 0]]) ->>> output = model(input_ids) ->>> print(output.logits) -tensor([[ 0.0082, -0.2307], - [ 0.1317, -0.1683]], grad_fn=) -``` - -Here is the actual output of the second sequence: - -```py ->>> input_ids = torch.tensor([[7592]]) ->>> output = model(input_ids) ->>> print(output.logits) -tensor([[-0.1008, -0.4061]], grad_fn=) -``` - -Most of the time, you should provide an `attention_mask` to your model to ignore the padding tokens to avoid this silent error. Now the output of the second sequence matches its actual output: - - - -By default, the tokenizer creates an `attention_mask` for you based on your specific tokenizer's defaults. 
- - - -```py ->>> attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1], [1, 0, 0, 0, 0, 0]]) ->>> output = model(input_ids, attention_mask=attention_mask) ->>> print(output.logits) -tensor([[ 0.0082, -0.2307], - [-0.1008, -0.4061]], grad_fn=) -``` - -🤗 Transformers doesn't automatically create an `attention_mask` to mask a padding token if it is provided because: - -- Some models don't have a padding token. -- For some use-cases, users want a model to attend to a padding token. \ No newline at end of file diff --git a/docs/source/es/_toctree.yml b/docs/source/es/_toctree.yml index e107915dc121..dd110b746c6e 100644 --- a/docs/source/es/_toctree.yml +++ b/docs/source/es/_toctree.yml @@ -62,13 +62,15 @@ - sections: - local: debugging title: Debugging - title: Rendimiento y escalabilidad + title: Rendimiento y escalabilidad - sections: - local: add_new_pipeline title: ¿Cómo puedo añadir un pipeline a 🤗 Transformers? - local: pr_checks title: Verificaciones en un Pull Request title: Contribuir + - local: community + title: Los recursos de la comunidad title: Guías prácticas - sections: - local: philosophy diff --git a/docs/source/es/accelerate.md b/docs/source/es/accelerate.md new file mode 100644 index 000000000000..2c4063b7ca3b --- /dev/null +++ b/docs/source/es/accelerate.md @@ -0,0 +1,136 @@ + + +# Entrenamiento distribuido con 🤗 Accelerate + +El paralelismo ha emergido como una estrategia para entrenar modelos grandes en hardware limitado e incrementar la velocidad de entrenamiento en varios órdenes de magnitud. En Hugging Face creamos la biblioteca [🤗 Accelerate](https://huggingface.co/docs/accelerate) para ayudar a los usuarios a entrenar modelos 🤗 Transformers en cualquier tipo de configuración distribuida, ya sea en una máquina con múltiples GPUs o en múltiples GPUs distribuidas entre muchas máquinas. En este tutorial aprenderás cómo personalizar tu bucle de entrenamiento de PyTorch nativo para poder entrenar en entornos distribuidos. + +## Configuración + +Empecemos por instalar 🤗 Accelerate: + +```bash +pip install accelerate +``` + +Luego, importamos y creamos un objeto [`Accelerator`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator). `Accelerator` detectará automáticamente el tipo de configuración distribuida que tengas disponible e inicializará todos los componentes necesarios para el entrenamiento. No necesitas especificar el dispositivo en donde se debe colocar tu modelo. + +```py +>>> from accelerate import Accelerator + +>>> accelerator = Accelerator() +``` + +## Prepárate para acelerar + +Pasa todos los objetos relevantes para el entrenamiento al método [`prepare`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.prepare). Esto incluye los DataLoaders de entrenamiento y evaluación, un modelo y un optimizador: + +```py +>>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( +... train_dataloader, eval_dataloader, model, optimizer +... ) +``` + +## Backward + +Por último, reemplaza el típico `loss.backward()` en tu bucle de entrenamiento con el método [`backward`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.backward) de 🤗 Accelerate: + +```py +>>> for epoch in range(num_epochs): +... for batch in train_dataloader: +... outputs = model(**batch) +... loss = outputs.loss +... accelerator.backward(loss) + +... optimizer.step() +... lr_scheduler.step() +... optimizer.zero_grad() +... 
progress_bar.update(1) +``` + +Como se puede ver en el siguiente código, ¡solo necesitas adicionar cuatro líneas de código a tu bucle de entrenamiento para habilitar el entrenamiento distribuido! + +```diff ++ from accelerate import Accelerator + from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler + ++ accelerator = Accelerator() + + model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) + optimizer = AdamW(model.parameters(), lr=3e-5) + +- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +- model.to(device) + ++ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( ++ train_dataloader, eval_dataloader, model, optimizer ++ ) + + num_epochs = 3 + num_training_steps = num_epochs * len(train_dataloader) + lr_scheduler = get_scheduler( + "linear", + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=num_training_steps + ) + + progress_bar = tqdm(range(num_training_steps)) + + model.train() + for epoch in range(num_epochs): + for batch in train_dataloader: +- batch = {k: v.to(device) for k, v in batch.items()} + outputs = model(**batch) + loss = outputs.loss +- loss.backward() ++ accelerator.backward(loss) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) +``` + +## Entrenamiento + +Una vez que hayas añadido las líneas de código relevantes, inicia el entrenamiento desde un script o notebook como Colaboratory. + +### Entrenar con un script + +Si estás corriendo tu entrenamiento desde un script ejecuta el siguiente comando para crear y guardar un archivo de configuración: + +```bash +accelerate config +``` + +Comienza el entrenamiento con: + +```bash +accelerate launch train.py +``` + +### Entrenar con un notebook + +🤗 Accelerate puede correr en un notebook si, por ejemplo, estás planeando utilizar las TPUs de Colaboratory. Encierra el código responsable del entrenamiento en una función y pásalo a `notebook_launcher`: + +```py +>>> from accelerate import notebook_launcher + +>>> notebook_launcher(training_function) +``` + +Para obtener más información sobre 🤗 Accelerate y sus numerosas funciones, consulta la [documentación](https://huggingface.co/docs/accelerate). diff --git a/docs/source/es/accelerate.mdx b/docs/source/es/accelerate.mdx deleted file mode 100644 index 6065bc110a1d..000000000000 --- a/docs/source/es/accelerate.mdx +++ /dev/null @@ -1,132 +0,0 @@ - - -# Entrenamiento distribuido con 🤗 Accelerate - -El paralelismo ha emergido como una estrategia para entrenar modelos grandes en hardware limitado e incrementar la velocidad de entrenamiento en varios órdenes de magnitud. En Hugging Face creamos la biblioteca [🤗 Accelerate](https://huggingface.co/docs/accelerate) para ayudar a los usuarios a entrenar modelos 🤗 Transformers en cualquier tipo de configuración distribuida, ya sea en una máquina con múltiples GPUs o en múltiples GPUs distribuidas entre muchas máquinas. En este tutorial aprenderás cómo personalizar tu bucle de entrenamiento de PyTorch nativo para poder entrenar en entornos distribuidos. - -## Configuración - -Empecemos por instalar 🤗 Accelerate: - -```bash -pip install accelerate -``` - -Luego, importamos y creamos un objeto [`Accelerator`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator). `Accelerator` detectará automáticamente el tipo de configuración distribuida que tengas disponible e inicializará todos los componentes necesarios para el entrenamiento. 
No necesitas especificar el dispositivo en donde se debe colocar tu modelo. - -```py ->>> from accelerate import Accelerator - ->>> accelerator = Accelerator() -``` - -## Prepárate para acelerar - -Pasa todos los objetos relevantes para el entrenamiento al método [`prepare`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.prepare). Esto incluye los DataLoaders de entrenamiento y evaluación, un modelo y un optimizador: - -```py ->>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( -... train_dataloader, eval_dataloader, model, optimizer -... ) -``` - -## Backward - -Por último, reemplaza el típico `loss.backward()` en tu bucle de entrenamiento con el método [`backward`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.backward) de 🤗 Accelerate: - -```py ->>> for epoch in range(num_epochs): -... for batch in train_dataloader: -... outputs = model(**batch) -... loss = outputs.loss -... accelerator.backward(loss) - -... optimizer.step() -... lr_scheduler.step() -... optimizer.zero_grad() -... progress_bar.update(1) -``` - -Como se puede ver en el siguiente código, ¡solo necesitas adicionar cuatro líneas de código a tu bucle de entrenamiento para habilitar el entrenamiento distribuido! - -```diff -+ from accelerate import Accelerator - from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler - -+ accelerator = Accelerator() - - model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) - optimizer = AdamW(model.parameters(), lr=3e-5) - -- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") -- model.to(device) - -+ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( -+ train_dataloader, eval_dataloader, model, optimizer -+ ) - - num_epochs = 3 - num_training_steps = num_epochs * len(train_dataloader) - lr_scheduler = get_scheduler( - "linear", - optimizer=optimizer, - num_warmup_steps=0, - num_training_steps=num_training_steps - ) - - progress_bar = tqdm(range(num_training_steps)) - - model.train() - for epoch in range(num_epochs): - for batch in train_dataloader: -- batch = {k: v.to(device) for k, v in batch.items()} - outputs = model(**batch) - loss = outputs.loss -- loss.backward() -+ accelerator.backward(loss) - - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad() - progress_bar.update(1) -``` - -## Entrenamiento - -Una vez que hayas añadido las líneas de código relevantes, inicia el entrenamiento desde un script o notebook como Colaboratory. - -### Entrenar con un script - -Si estás corriendo tu entrenamiento desde un script ejecuta el siguiente comando para crear y guardar un archivo de configuración: - -```bash -accelerate config -``` - -Comienza el entrenamiento con: - -```bash -accelerate launch train.py -``` - -### Entrenar con un notebook - -🤗 Accelerate puede correr en un notebook si, por ejemplo, estás planeando utilizar las TPUs de Colaboratory. Encierra el código responsable del entrenamiento en una función y pásalo a `notebook_launcher`: - -```py ->>> from accelerate import notebook_launcher - ->>> notebook_launcher(training_function) -``` - -Para obtener más información sobre 🤗 Accelerate y sus numerosas funciones, consulta la [documentación](https://huggingface.co/docs/accelerate). 
diff --git a/docs/source/es/add_new_pipeline.md b/docs/source/es/add_new_pipeline.md new file mode 100644 index 000000000000..5e64c435ab98 --- /dev/null +++ b/docs/source/es/add_new_pipeline.md @@ -0,0 +1,264 @@ + + +# ¿Cómo puedo crear un pipeline personalizado? + +En esta guía, veremos cómo crear un pipeline personalizado y cómo compartirlo en el [Hub](hf.co/models) o añadirlo +a la biblioteca 🤗 Transformers. + +En primer lugar, debes decidir las entradas que tu pipeline podrá recibir. Pueden ser strings, bytes, +diccionarios o lo que te parezca que vaya a ser la entrada más apropiada. Intenta mantener estas entradas en un +formato que sea tan Python puro como sea posible, puesto que esto facilita la compatibilidad (incluso con otros +lenguajes de programación por medio de JSON). Estos serán los `inputs` (entradas) del pipeline (`preprocess`). + +Ahora debes definir los `outputs` (salidas). Al igual que con los `inputs`, entre más simple el formato, mejor. +Estas serán las salidas del método `postprocess` (posprocesamiento). + +Empieza heredando la clase base `Pipeline` con los 4 métodos que debemos implementar: `preprocess` (preprocesamiento), +`_forward` (ejecución), `postprocess` (posprocesamiento) y `_sanitize_parameters` (verificar parámetros). + +```python +from transformers import Pipeline + + +class MyPipeline(Pipeline): + def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "maybe_arg" in kwargs: + preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] + return preprocess_kwargs, {}, {} + + def preprocess(self, inputs, maybe_arg=2): + model_input = Tensor(inputs["input_ids"]) + return {"model_input": model_input} + + def _forward(self, model_inputs): + # model_inputs == {"model_input": model_input} + outputs = self.model(**model_inputs) + # Quizá {"logits": Tensor(...)} + return outputs + + def postprocess(self, model_outputs): + best_class = model_outputs["logits"].softmax(-1) + return best_class +``` + +La estructura de este desglose es así para garantizar una compatibilidad más o menos transparente con el uso de +CPU/GPU y el pre/posprocesamiento en CPU en varios hilos. + +`preprocess` tomará las entradas definidas originalmente y las convertirá en algo que se le pueda pasar al modelo. +Podría contener más información y a menudo es un objeto `Dict` (diccionario). + +`_forward` contiene los detalles de la implementación y no debería ser invocado de forma directa. `forward` es el +método preferido a utilizar pues contiene verificaciones para asegurar que todo funcione en el dispositivo correcto. +Cualquier cosa que esté relacionada con un modelo real debería ir en el método `_forward`, todo lo demás va en +los métodos de preprocesamiento y posprocesamiento. + +Los métodos `postprocess` reciben la salida `_forward` y la convierten en la salida final que decidimos +anteriormente. + +`_sanitize_parameters` existe para permitir a los usuarios pasar cualesquiera parámetros cuando lo deseen, ya +sea al momento de inicializar el pipeline `pipeline(...., maybe_arg=4)` o al momento de invocarlo +`pipe = pipeline(...); output = pipe(...., maybe_arg=4)`. + + +El método `_sanitize_parameters` devuelve 3 diccionarios de kwargs que serán pasados directamente a `preprocess`, +`_forward` y `postprocess`. No ingreses nada si el caller no se va a invocar con parámetros adicionales. +Esto permite mantener los parámetros por defecto de la definición de la función, lo que es más "natural". 
+ +Un ejemplo clásico sería un argumento `top_k` en el posprocesamiento de una tarea de clasificación. + +```python +>>> pipe = pipeline("my-new-task") +>>> pipe("This is a test") +[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05} +{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}] + +>>> pipe("This is a test", top_k=2) +[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}] +``` + +Para lograrlo, actualizaremos nuestro método `postprocess` con un valor por defecto de `5` y modificaremos +`_sanitize_parameters` para permitir este nuevo parámetro. + + +```python +def postprocess(self, model_outputs, top_k=5): + best_class = model_outputs["logits"].softmax(-1) + # Añade la lógica para manejar el top_k + return best_class + + +def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "maybe_arg" in kwargs: + preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] + + postprocess_kwargs = {} + if "top_k" in kwargs: + postprocess_kwargs["top_k"] = kwargs["top_k"] + return preprocess_kwargs, {}, postprocess_kwargs +``` + +Intenta que las entradas y salidas sean muy simples e, idealmente, que puedan serializarse como JSON, pues esto +hace el uso del pipeline muy sencillo sin que el usuario tenga que preocuparse por conocer nuevos tipos de objetos. +También es relativamente común tener compatibilidad con muchos tipos diferentes de argumentos por facilidad de uso +(por ejemplo, los archivos de audio pueden ser nombres de archivo, URLs o bytes). + + +## Añadirlo a la lista de tareas + +Para registrar tu `new-task` (nueva tarea) en la lista de tareas, debes añadirla al +`PIPELINE_REGISTRY` (registro de pipelines): + +```python +from transformers.pipelines import PIPELINE_REGISTRY + +PIPELINE_REGISTRY.register_pipeline( + "new-task", + pipeline_class=MyPipeline, + pt_model=AutoModelForSequenceClassification, +) +``` + +Puedes especificar un modelo por defecto si lo deseas, en cuyo caso debe venir con una versión específica (que puede ser el nombre de un branch o hash de commit, en este caso usamos `"abcdef"`), así como el tipo: + +```python +PIPELINE_REGISTRY.register_pipeline( + "new-task", + pipeline_class=MyPipeline, + pt_model=AutoModelForSequenceClassification, + default={"pt": ("user/awesome_model", "abcdef")}, + type="text", # tipo de datos que maneja: texto, audio, imagen, multi-modalidad +) +``` + +## Comparte tu pipeline en el Hub + +Para compartir tu pipeline personalizado en el Hub, solo tienes que guardar el código personalizado de tu sub-clase +`Pipeline` en un archivo de Python. 
Por ejemplo, digamos que queremos usar un pipeline personalizado para la +clasificación de duplas de oraciones de esta forma: + +```py +import numpy as np + +from transformers import Pipeline + + +def softmax(outputs): + maxes = np.max(outputs, axis=-1, keepdims=True) + shifted_exp = np.exp(outputs - maxes) + return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) + + +class PairClassificationPipeline(Pipeline): + def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "second_text" in kwargs: + preprocess_kwargs["second_text"] = kwargs["second_text"] + return preprocess_kwargs, {}, {} + + def preprocess(self, text, second_text=None): + return self.tokenizer(text, text_pair=second_text, return_tensors=self.framework) + + def _forward(self, model_inputs): + return self.model(**model_inputs) + + def postprocess(self, model_outputs): + logits = model_outputs.logits[0].numpy() + probabilities = softmax(logits) + + best_class = np.argmax(probabilities) + label = self.model.config.id2label[best_class] + score = probabilities[best_class].item() + logits = logits.tolist() + return {"label": label, "score": score, "logits": logits} +``` + +La implementación es independiente del framework y funcionará con modelos de PyTorch y TensorFlow. Si guardamos +esto en un archivo llamado `pair_classification.py`, podemos importarlo y registrarlo de la siguiente manera: + +```py +from pair_classification import PairClassificationPipeline +from transformers.pipelines import PIPELINE_REGISTRY +from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification + +PIPELINE_REGISTRY.register_pipeline( + "pair-classification", + pipeline_class=PairClassificationPipeline, + pt_model=AutoModelForSequenceClassification, + tf_model=TFAutoModelForSequenceClassification, +) +``` + +Una vez hecho esto, podemos usarlo con un modelo pre-entrenado. Por ejemplo, al modelo `sgugger/finetuned-bert-mrpc` +se le hizo fine-tuning con el dataset MRPC, en el cual se clasifican duplas de oraciones como paráfrasis o no. + +```py +from transformers import pipeline + +classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc") +``` + +Ahora podemos compartirlo en el Hub usando el método `save_pretrained` (guardar pre-entrenado) en un `Repository`: + +```py +from huggingface_hub import Repository + +repo = Repository("test-dynamic-pipeline", clone_from="{your_username}/test-dynamic-pipeline") +classifier.save_pretrained("test-dynamic-pipeline") +repo.push_to_hub() +``` + +Esto copiará el archivo donde definiste `PairClassificationPipeline` dentro de la carpeta `"test-dynamic-pipeline"`, +y además guardará el modelo y el tokenizer del pipeline, antes de enviar todo al repositorio +`{your_username}/test-dynamic-pipeline`. Después de esto, cualquier persona puede usarlo siempre que usen la opción +`trust_remote_code=True` (confiar en código remoto): + +```py +from transformers import pipeline + +classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True) +``` + +## Añadir el pipeline a 🤗 Transformers + +Si quieres contribuir tu pipeline a la biblioteca 🤗 Transformers, tendrás que añadirlo a un nuevo módulo en el +sub-módulo `pipelines` con el código de tu pipeline. Luego, debes añadirlo a la lista de tareas definidas en +`pipelines/__init__.py`. + +A continuación tienes que añadir las pruebas. Crea un nuevo archivo llamado `tests/test_pipelines_MY_PIPELINE.py` +basándote en las pruebas existentes. 
+ +La función `run_pipeline_test` será muy genérica y se correrá sobre modelos pequeños escogidos al azar sobre todas las +arquitecturas posibles definidas en `model_mapping` y `tf_model_mapping`. + +Esto es muy importante para probar compatibilidades a futuro, lo que significa que si alguien añade un nuevo modelo +para `XXXForQuestionAnswering` entonces el pipeline intentará ejecutarse con ese modelo. Ya que los modelos son aleatorios, +es imposible verificar los valores como tales, y es por eso que hay un helper `ANY` que simplemente intentará que la +salida tenga el mismo tipo que la salida esperada del pipeline. + +También *debes* implementar 2 (preferiblemente 4) pruebas: + +- `test_small_model_pt` : Define un (1) modelo pequeño para este pipeline (no importa si los resultados no tienen sentido) +y prueba las salidas del pipeline. Los resultados deberían ser los mismos que en `test_small_model_tf`. +- `test_small_model_tf` : Define un (1) modelo pequeño para este pipeline (no importa si los resultados no tienen sentido) +y prueba las salidas del pipeline. Los resultados deberían ser los mismos que en `test_small_model_pt`. +- `test_large_model_pt` (`optional`): Prueba el pipeline en una tarea real en la que los resultados deben tener sentido. +Estas pruebas son lentas y deben marcarse como tales. El objetivo de esto es ejemplificar el pipeline y asegurarse de que +no haya divergencias en versiones futuras. +- `test_large_model_tf` (`optional`): Prueba el pipeline en una tarea real en la que los resultados deben tener sentido. +Estas pruebas son lentas y deben marcarse como tales. El objetivo de esto es ejemplificar el pipeline y asegurarse de que +no haya divergencias en versiones futuras. diff --git a/docs/source/es/add_new_pipeline.mdx b/docs/source/es/add_new_pipeline.mdx deleted file mode 100644 index 8e022077972f..000000000000 --- a/docs/source/es/add_new_pipeline.mdx +++ /dev/null @@ -1,260 +0,0 @@ - - -# ¿Cómo puedo crear un pipeline personalizado? - -En esta guía, veremos cómo crear un pipeline personalizado y cómo compartirlo en el [Hub](hf.co/models) o añadirlo -a la biblioteca 🤗 Transformers. - -En primer lugar, debes decidir las entradas que tu pipeline podrá recibir. Pueden ser strings, bytes, -diccionarios o lo que te parezca que vaya a ser la entrada más apropiada. Intenta mantener estas entradas en un -formato que sea tan Python puro como sea posible, puesto que esto facilita la compatibilidad (incluso con otros -lenguajes de programación por medio de JSON). Estos serán los `inputs` (entradas) del pipeline (`preprocess`). - -Ahora debes definir los `outputs` (salidas). Al igual que con los `inputs`, entre más simple el formato, mejor. -Estas serán las salidas del método `postprocess` (posprocesamiento). - -Empieza heredando la clase base `Pipeline` con los 4 métodos que debemos implementar: `preprocess` (preprocesamiento), -`_forward` (ejecución), `postprocess` (posprocesamiento) y `_sanitize_parameters` (verificar parámetros). 
- -```python -from transformers import Pipeline - - -class MyPipeline(Pipeline): - def _sanitize_parameters(self, **kwargs): - preprocess_kwargs = {} - if "maybe_arg" in kwargs: - preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] - return preprocess_kwargs, {}, {} - - def preprocess(self, inputs, maybe_arg=2): - model_input = Tensor(inputs["input_ids"]) - return {"model_input": model_input} - - def _forward(self, model_inputs): - # model_inputs == {"model_input": model_input} - outputs = self.model(**model_inputs) - # Quizá {"logits": Tensor(...)} - return outputs - - def postprocess(self, model_outputs): - best_class = model_outputs["logits"].softmax(-1) - return best_class -``` - -La estructura de este desglose es así para garantizar una compatibilidad más o menos transparente con el uso de -CPU/GPU y el pre/posprocesamiento en CPU en varios hilos. - -`preprocess` tomará las entradas definidas originalmente y las convertirá en algo que se le pueda pasar al modelo. -Podría contener más información y a menudo es un objeto `Dict` (diccionario). - -`_forward` contiene los detalles de la implementación y no debería ser invocado de forma directa. `forward` es el -método preferido a utilizar pues contiene verificaciones para asegurar que todo funcione en el dispositivo correcto. -Cualquier cosa que esté relacionada con un modelo real debería ir en el método `_forward`, todo lo demás va en -los métodos de preprocesamiento y posprocesamiento. - -Los métodos `postprocess` reciben la salida `_forward` y la convierten en la salida final que decidimos -anteriormente. - -`_sanitize_parameters` existe para permitir a los usuarios pasar cualesquiera parámetros cuando lo deseen, ya -sea al momento de inicializar el pipeline `pipeline(...., maybe_arg=4)` o al momento de invocarlo -`pipe = pipeline(...); output = pipe(...., maybe_arg=4)`. - - -El método `_sanitize_parameters` devuelve 3 diccionarios de kwargs que serán pasados directamente a `preprocess`, -`_forward` y `postprocess`. No ingreses nada si el caller no se va a invocar con parámetros adicionales. -Esto permite mantener los parámetros por defecto de la definición de la función, lo que es más "natural". - -Un ejemplo clásico sería un argumento `top_k` en el posprocesamiento de una tarea de clasificación. - -```python ->>> pipe = pipeline("my-new-task") ->>> pipe("This is a test") -[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05} -{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}] - ->>> pipe("This is a test", top_k=2) -[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}] -``` - -Para lograrlo, actualizaremos nuestro método `postprocess` con un valor por defecto de `5` y modificaremos -`_sanitize_parameters` para permitir este nuevo parámetro. 
- - -```python -def postprocess(self, model_outputs, top_k=5): - best_class = model_outputs["logits"].softmax(-1) - # Añade la lógica para manejar el top_k - return best_class - - -def _sanitize_parameters(self, **kwargs): - preprocess_kwargs = {} - if "maybe_arg" in kwargs: - preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] - - postprocess_kwargs = {} - if "top_k" in kwargs: - postprocess_kwargs["top_k"] = kwargs["top_k"] - return preprocess_kwargs, {}, postprocess_kwargs -``` - -Intenta que las entradas y salidas sean muy simples e, idealmente, que puedan serializarse como JSON, pues esto -hace el uso del pipeline muy sencillo sin que el usuario tenga que preocuparse por conocer nuevos tipos de objetos. -También es relativamente común tener compatibilidad con muchos tipos diferentes de argumentos por facilidad de uso -(por ejemplo, los archivos de audio pueden ser nombres de archivo, URLs o bytes). - - -## Añadirlo a la lista de tareas - -Para registrar tu `new-task` (nueva tarea) en la lista de tareas, debes añadirla al -`PIPELINE_REGISTRY` (registro de pipelines): - -```python -from transformers.pipelines import PIPELINE_REGISTRY - -PIPELINE_REGISTRY.register_pipeline( - "new-task", - pipeline_class=MyPipeline, - pt_model=AutoModelForSequenceClassification, -) -``` - -Puedes especificar un modelo por defecto si lo deseas, en cuyo caso debe venir con una versión específica (que puede ser el nombre de un branch o hash de commit, en este caso usamos `"abcdef"`), así como el tipo: - -```python -PIPELINE_REGISTRY.register_pipeline( - "new-task", - pipeline_class=MyPipeline, - pt_model=AutoModelForSequenceClassification, - default={"pt": ("user/awesome_model", "abcdef")}, - type="text", # tipo de datos que maneja: texto, audio, imagen, multi-modalidad -) -``` - -## Comparte tu pipeline en el Hub - -Para compartir tu pipeline personalizado en el Hub, solo tienes que guardar el código personalizado de tu sub-clase -`Pipeline` en un archivo de Python. Por ejemplo, digamos que queremos usar un pipeline personalizado para la -clasificación de duplas de oraciones de esta forma: - -```py -import numpy as np - -from transformers import Pipeline - - -def softmax(outputs): - maxes = np.max(outputs, axis=-1, keepdims=True) - shifted_exp = np.exp(outputs - maxes) - return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) - - -class PairClassificationPipeline(Pipeline): - def _sanitize_parameters(self, **kwargs): - preprocess_kwargs = {} - if "second_text" in kwargs: - preprocess_kwargs["second_text"] = kwargs["second_text"] - return preprocess_kwargs, {}, {} - - def preprocess(self, text, second_text=None): - return self.tokenizer(text, text_pair=second_text, return_tensors=self.framework) - - def _forward(self, model_inputs): - return self.model(**model_inputs) - - def postprocess(self, model_outputs): - logits = model_outputs.logits[0].numpy() - probabilities = softmax(logits) - - best_class = np.argmax(probabilities) - label = self.model.config.id2label[best_class] - score = probabilities[best_class].item() - logits = logits.tolist() - return {"label": label, "score": score, "logits": logits} -``` - -La implementación es independiente del framework y funcionará con modelos de PyTorch y TensorFlow. 
Si guardamos -esto en un archivo llamado `pair_classification.py`, podemos importarlo y registrarlo de la siguiente manera: - -```py -from pair_classification import PairClassificationPipeline -from transformers.pipelines import PIPELINE_REGISTRY -from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification - -PIPELINE_REGISTRY.register_pipeline( - "pair-classification", - pipeline_class=PairClassificationPipeline, - pt_model=AutoModelForSequenceClassification, - tf_model=TFAutoModelForSequenceClassification, -) -``` - -Una vez hecho esto, podemos usarlo con un modelo pre-entrenado. Por ejemplo, al modelo `sgugger/finetuned-bert-mrpc` -se le hizo fine-tuning con el dataset MRPC, en el cual se clasifican duplas de oraciones como paráfrasis o no. - -```py -from transformers import pipeline - -classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc") -``` - -Ahora podemos compartirlo en el Hub usando el método `save_pretrained` (guardar pre-entrenado) en un `Repository`: - -```py -from huggingface_hub import Repository - -repo = Repository("test-dynamic-pipeline", clone_from="{your_username}/test-dynamic-pipeline") -classifier.save_pretrained("test-dynamic-pipeline") -repo.push_to_hub() -``` - -Esto copiará el archivo donde definiste `PairClassificationPipeline` dentro de la carpeta `"test-dynamic-pipeline"`, -y además guardará el modelo y el tokenizer del pipeline, antes de enviar todo al repositorio -`{your_username}/test-dynamic-pipeline`. Después de esto, cualquier persona puede usarlo siempre que usen la opción -`trust_remote_code=True` (confiar en código remoto): - -```py -from transformers import pipeline - -classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True) -``` - -## Añadir el pipeline a 🤗 Transformers - -Si quieres contribuir tu pipeline a la biblioteca 🤗 Transformers, tendrás que añadirlo a un nuevo módulo en el -sub-módulo `pipelines` con el código de tu pipeline. Luego, debes añadirlo a la lista de tareas definidas en -`pipelines/__init__.py`. - -A continuación tienes que añadir las pruebas. Crea un nuevo archivo llamado `tests/test_pipelines_MY_PIPELINE.py` -basándote en las pruebas existentes. - -La función `run_pipeline_test` será muy genérica y se correrá sobre modelos pequeños escogidos al azar sobre todas las -arquitecturas posibles definidas en `model_mapping` y `tf_model_mapping`. - -Esto es muy importante para probar compatibilidades a futuro, lo que significa que si alguien añade un nuevo modelo -para `XXXForQuestionAnswering` entonces el pipeline intentará ejecutarse con ese modelo. Ya que los modelos son aleatorios, -es imposible verificar los valores como tales, y es por eso que hay un helper `ANY` que simplemente intentará que la -salida tenga el mismo tipo que la salida esperada del pipeline. - -También *debes* implementar 2 (preferiblemente 4) pruebas: - -- `test_small_model_pt` : Define un (1) modelo pequeño para este pipeline (no importa si los resultados no tienen sentido) -y prueba las salidas del pipeline. Los resultados deberían ser los mismos que en `test_small_model_tf`. -- `test_small_model_tf` : Define un (1) modelo pequeño para este pipeline (no importa si los resultados no tienen sentido) -y prueba las salidas del pipeline. Los resultados deberían ser los mismos que en `test_small_model_pt`. -- `test_large_model_pt` (`optional`): Prueba el pipeline en una tarea real en la que los resultados deben tener sentido. 
-Estas pruebas son lentas y deben marcarse como tales. El objetivo de esto es ejemplificar el pipeline y asegurarse de que -no haya divergencias en versiones futuras. -- `test_large_model_tf` (`optional`): Prueba el pipeline en una tarea real en la que los resultados deben tener sentido. -Estas pruebas son lentas y deben marcarse como tales. El objetivo de esto es ejemplificar el pipeline y asegurarse de que -no haya divergencias en versiones futuras. diff --git a/docs/source/es/autoclass_tutorial.md b/docs/source/es/autoclass_tutorial.md new file mode 100644 index 000000000000..8b3ddd230b6b --- /dev/null +++ b/docs/source/es/autoclass_tutorial.md @@ -0,0 +1,123 @@ + + +# Carga instancias preentrenadas con un AutoClass + +Con tantas arquitecturas diferentes de Transformer puede ser retador crear una para tu checkpoint. Como parte de la filosofía central de 🤗 Transformers para hacer que la biblioteca sea fácil, simple y flexible de usar; una `AutoClass` automáticamente infiere y carga la arquitectura correcta desde un checkpoint dado. El método `from_pretrained` te permite cargar rápidamente un modelo preentrenado para cualquier arquitectura, por lo que no tendrás que dedicar tiempo y recursos para entrenar uno desde cero. Producir este tipo de código con checkpoint implica que si funciona con uno, funcionará también con otro (siempre que haya sido entrenado para una tarea similar) incluso si la arquitectura es distinta. + + + +Recuerda, la arquitectura se refiere al esqueleto del modelo y los checkpoints son los pesos para una arquitectura dada. Por ejemplo, [BERT](https://huggingface.co/bert-base-uncased) es una arquitectura, mientras que `bert-base-uncased` es un checkpoint. Modelo es un término general que puede significar una arquitectura o un checkpoint. + + + +En este tutorial, aprenderás a: + +* Cargar un tokenizador pre-entrenado. +* Cargar un extractor de características (feature extractor en inglés) pre-entrenado. +* Cargar un procesador pre-entrenado. +* Cargar un modelo pre-entrenado. + +## AutoTokenizer + +Casi cualquier tarea de Procesamiento de Lenguaje Natural comienza con un tokenizador. Un tokenizador convierte tu input a un formato que puede ser procesado por el modelo. + +Carga un tokenizador con [`AutoTokenizer.from_pretrained`]: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") +``` + +Luego tokeniza tu input como lo mostrado a continuación: + +```py +>>> sequence = "In a hole in the ground there lived a hobbit." +>>> print(tokenizer(sequence)) +{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +## AutoFeatureExtractor + +Para tareas de audio y visión, un extractor de características procesa la señal de audio o imagen al formato de input correcto. + +Carga un extractor de características con [`AutoFeatureExtractor.from_pretrained`]: + +```py +>>> from transformers import AutoFeatureExtractor + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained( +... "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" +... ) +``` + +## AutoProcessor + +Las tareas multimodales requieren un procesador que combine dos tipos de herramientas de preprocesamiento. 
Por ejemplo, el modelo [LayoutLMV2](model_doc/layoutlmv2) requiere que un extractor de características maneje las imágenes y que un tokenizador maneje el texto; un procesador combina ambas. + +Carga un procesador con [`AutoProcessor.from_pretrained`]: + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased") +``` + +## AutoModel + + + +Finalmente, las clases `AutoModelFor` te permiten cargar un modelo preentrenado para una tarea dada (revisa [aquí](model_doc/auto) para conocer la lista completa de tareas disponibles). Por ejemplo, carga un modelo para clasificación de secuencias con [`AutoModelForSequenceClassification.from_pretrained`]: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Reutiliza fácilmente el mismo checkpoint para cargar una arquitectura para alguna tarea diferente: + +```py +>>> from transformers import AutoModelForTokenClassification + +>>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") +``` + +Generalmente recomendamos utilizar las clases `AutoTokenizer` y `AutoModelFor` para cargar instancias pre-entrenadas de modelos. Esto asegurará que cargues la arquitectura correcta en cada ocasión. En el siguiente [tutorial](preprocessing), aprende a usar tu tokenizador recién cargado, el extractor de características y el procesador para preprocesar un dataset para fine-tuning. + + +Finalmente, la clase `TFAutoModelFor` te permite cargar tu modelo pre-entrenado para una tarea dada (revisa [aquí](model_doc/auto) para conocer la lista completa de tareas disponibles). Por ejemplo, carga un modelo para clasificación de secuencias con [`TFAutoModelForSequenceClassification.from_pretrained`]: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Reutiliza fácilmente el mismo checkpoint para cargar una arquitectura para alguna tarea diferente: + +```py +>>> from transformers import TFAutoModelForTokenClassification + +>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") +``` + +Generalmente recomendamos utilizar las clases `AutoTokenizer` y `TFAutoModelFor` para cargar instancias de modelos pre-entrenados. Esto asegurará que cargues la arquitectura correcta cada vez. En el siguiente [tutorial](preprocessing), aprende a usar tu tokenizador recién cargado, el extractor de características y el procesador para preprocesar un dataset para fine-tuning. + + diff --git a/docs/source/es/autoclass_tutorial.mdx b/docs/source/es/autoclass_tutorial.mdx deleted file mode 100644 index e04a639422bb..000000000000 --- a/docs/source/es/autoclass_tutorial.mdx +++ /dev/null @@ -1,119 +0,0 @@ - - -# Carga instancias preentrenadas con un AutoClass - -Con tantas arquitecturas diferentes de Transformer puede ser retador crear una para tu checkpoint. Como parte de la filosofía central de 🤗 Transformers para hacer que la biblioteca sea fácil, simple y flexible de usar; una `AutoClass` automáticamente infiere y carga la arquitectura correcta desde un checkpoint dado. El método `from_pretrained` te permite cargar rápidamente un modelo preentrenado para cualquier arquitectura, por lo que no tendrás que dedicar tiempo y recursos para entrenar uno desde cero.
Producir este tipo de código con checkpoint implica que si funciona con uno, funcionará también con otro (siempre que haya sido entrenado para una tarea similar) incluso si la arquitectura es distinta. - - - -Recuerda, la arquitectura se refiere al esqueleto del modelo y los checkpoints son los pesos para una arquitectura dada. Por ejemplo, [BERT](https://huggingface.co/bert-base-uncased) es una arquitectura, mientras que `bert-base-uncased` es un checkpoint. Modelo es un término general que puede significar una arquitectura o un checkpoint. - - - -En este tutorial, aprenderás a: - -* Cargar un tokenizador pre-entrenado. -* Cargar un extractor de características (feature extractor en inglés) pre-entrenado. -* Cargar un procesador pre-entrenado. -* Cargar un modelo pre-entrenado. - -## AutoTokenizer - -Casi cualquier tarea de Procesamiento de Lenguaje Natural comienza con un tokenizador. Un tokenizador convierte tu input a un formato que puede ser procesado por el modelo. - -Carga un tokenizador con [`AutoTokenizer.from_pretrained`]: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") -``` - -Luego tokeniza tu input como lo mostrado a continuación: - -```py ->>> sequence = "In a hole in the ground there lived a hobbit." ->>> print(tokenizer(sequence)) -{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -## AutoFeatureExtractor - -Para tareas de audio y visión, un extractor de características procesa la señal de audio o imagen al formato de input correcto. - -Carga un extractor de características con [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained( -... "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" -... ) -``` - -## AutoProcessor - -Las tareas multimodales requieren un procesador que combine dos tipos de herramientas de preprocesamiento. Por ejemplo, el modelo [LayoutLMV2](model_doc/layoutlmv2) requiere que un extractor de características maneje las imágenes y que un tokenizador maneje el texto; un procesador combina ambas. - -Carga un procesador con [`AutoProcessor.from_pretrained`]: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased") -``` - -## AutoModel - - - -Finalmente, las clases `AutoModelFor` te permiten cargar un modelo preentrenado para una tarea dada (revisa [aquí](model_doc/auto) para conocer la lista completa de tareas disponibles). Por ejemplo, cargue un modelo para clasificación de secuencias con [`AutoModelForSequenceClassification.from_pretrained`]: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Reutiliza fácilmente el mismo checkpoint para cargar una aquitectura para alguna tarea diferente: - -```py ->>> from transformers import AutoModelForTokenClassification - ->>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") -``` - -Generalmente recomendamos utilizar las clases `AutoTokenizer` y `AutoModelFor` para cargar instancias pre-entrenadas de modelos. Ésto asegurará que cargues la arquitectura correcta en cada ocasión. 
En el siguiente [tutorial](preprocessing), aprende a usar tu tokenizador recién cargado, el extractor de características y el procesador para preprocesar un dataset para fine-tuning. - - -Finalmente, la clase `TFAutoModelFor` te permite cargar tu modelo pre-entrenado para una tarea dada (revisa [aquí](model_doc/auto) para conocer la lista completa de tareas disponibles). Por ejemplo, carga un modelo para clasificación de secuencias con [`TFAutoModelForSequenceClassification.from_pretrained`]: - -```py ->>> from transformers import TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Reutiliza fácilmente el mismo checkpoint para cargar una aquitectura para alguna tarea diferente: - -```py ->>> from transformers import TFAutoModelForTokenClassification - ->>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") -``` - -Generalmente recomendamos utilizar las clases `AutoTokenizer` y `TFAutoModelFor` para cargar instancias de modelos pre-entrenados. Ésto asegurará que cargues la arquitectura correcta cada vez. En el siguiente [tutorial](preprocessing), aprende a usar tu tokenizador recién cargado, el extractor de características y el procesador para preprocesar un dataset para fine-tuning. - - diff --git a/docs/source/es/bertology.md b/docs/source/es/bertology.md new file mode 100644 index 000000000000..ed4e12a8d59c --- /dev/null +++ b/docs/source/es/bertology.md @@ -0,0 +1,41 @@ + + +# BERTología + +Hay un creciente campo de estudio empeñado en la investigación del funcionamiento interno de los transformers de gran escala como BERT +(que algunos llaman "BERTología"). Algunos buenos ejemplos de este campo son: + + +- BERT Rediscovers the Classical NLP Pipeline por Ian Tenney, Dipanjan Das, Ellie Pavlick: + https://arxiv.org/abs/1905.05950 +- Are Sixteen Heads Really Better than One? por Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650 +- What Does BERT Look At? An Analysis of BERT's Attention por Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. + Manning: https://arxiv.org/abs/1906.04341 +- CAT-probing: A Metric-based Approach to Interpret How Pre-trained Models for Programming Language Attend Code Structure: https://arxiv.org/abs/2210.04633 + +Para asistir al desarrollo de este nuevo campo, hemos incluido algunas features adicionales en los modelos BERT/GPT/GPT-2 para +ayudar a acceder a las representaciones internas, principalmente adaptado de la gran obra de Paul Michel +(https://arxiv.org/abs/1905.10650): + + +- accediendo a todos los hidden-states de BERT/GPT/GPT-2, +- accediendo a todos los pesos de atención para cada head de BERT/GPT/GPT-2, +- adquiriendo los valores de salida y gradientes de las heads para poder computar la métrica de importancia de las heads y realizar la poda de heads como se explica + en https://arxiv.org/abs/1905.10650. + +Para ayudarte a entender y usar estas features, hemos añadido un script específico de ejemplo: [bertology.py](https://github.com/huggingface/transformers/tree/main/examples/research_projects/bertology/run_bertology.py) que extrae información y poda un modelo pre-entrenado en +GLUE.
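
Como referencia rápida, este es un boceto mínimo (una ilustración con suposiciones, no el contenido de `bertology.py`) de cómo acceder a los hidden-states, a los pesos de atención y a la poda de heads usando la API pública de la biblioteca:

```py
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Activamos la salida de hidden-states y de pesos de atención
model = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states=True, output_attentions=True)

inputs = tokenizer("Hello, world!", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

print(len(outputs.hidden_states))   # embeddings + una entrada por capa
print(outputs.attentions[0].shape)  # (batch, num_heads, seq_len, seq_len)

# Poda de heads: un diccionario {índice_de_capa: [índices_de_heads_a_podar]}
model.prune_heads({0: [0, 1]})
```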
diff --git a/docs/source/es/bertology.mdx b/docs/source/es/bertology.mdx deleted file mode 100644 index 4a3a1e551bcf..000000000000 --- a/docs/source/es/bertology.mdx +++ /dev/null @@ -1,36 +0,0 @@ - - -# BERTología - -Hay un creciente campo de estudio empeñado en la investigación del funcionamiento interno de los transformers de gran escala como BERT -(que algunos llaman "BERTología"). Algunos buenos ejemplos de este campo son: - - -- BERT Rediscovers the Classical NLP Pipeline por Ian Tenney, Dipanjan Das, Ellie Pavlick: - https://arxiv.org/abs/1905.05950 -- Are Sixteen Heads Really Better than One? por Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650 -- What Does BERT Look At? An Analysis of BERT's Attention por Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. - Manning: https://arxiv.org/abs/1906.04341 - -Para asistir al desarrollo de este nuevo campo, hemos incluido algunas features adicionales en los modelos BERT/GPT/GPT-2 para -ayudar a acceder a las representaciones internas, principalmente adaptado de la gran obra de Paul Michel -(https://arxiv.org/abs/1905.10650): - - -- accediendo a todos los hidden-states de BERT/GPT/GPT-2, -- accediendo a todos los pesos de atención para cada head de BERT/GPT/GPT-2, -- adquiriendo los valores de salida y gradientes de las heads para poder computar la métrica de importancia de las heads y realizar la poda de heads como se explica - en https://arxiv.org/abs/1905.10650. - -Para ayudarte a entender y usar estas features, hemos añadido un script específico de ejemplo: [bertology.py](https://github.com/huggingface/transformers/tree/main/examples/research_projects/bertology/run_bertology.py) mientras extraes información y cortas un modelo pre-entrenado en -GLUE. diff --git a/docs/source/es/community.md b/docs/source/es/community.md new file mode 100644 index 000000000000..261970e6fe7d --- /dev/null +++ b/docs/source/es/community.md @@ -0,0 +1,69 @@ + + +# Comunidad + +Esta página agrupa los recursos de 🤗 Transformers desarrollados por la comunidad. + +## Los recursos de la comunidad: + +| Recurso | Descripción | Autor | +|:----------|:-------------|------:| +| [Hugging Face Transformers Glossary Flashcards](https://www.darigovresearch.com/huggingface-transformers-glossary-flashcards) | Un conjunto de flashcards basadas en el [Glosario de documentos de Transformers] (glosario) que se ha puesto en un formato que se puede aprender/revisar fácilmente usando [Anki] (https://apps.ankiweb.net/) una fuente abierta, aplicación de multiplataforma diseñada específicamente para la retención de conocimientos a largo plazo. Ve este [Introductory video on how to use the flashcards](https://www.youtube.com/watch?v=Dji_h7PILrw). | [Darigov Research](https://www.darigovresearch.com/) | + +## Los cuadernos de la comunidad: + +| Cuaderno | Descripción | Autor | | +|:----------|:-------------|:-------------|------:| +| [Ajustar un transformador preentrenado para generar letras](https://github.com/AlekseyKorshuk/huggingartists) | Cómo generar letras al estilo de tu artista favorito ajustando un modelo GPT-2 | [Aleksey Korshuk](https://github.com/AlekseyKorshuk) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb) | +| [Entrenar T5 en Tensorflow 2](https://github.com/snapthat/TF-T5-text-to-text) | Cómo entrenar a T5 para cualquier tarea usando Tensorflow 2. 
Este cuaderno demuestra una tarea de preguntas y respuestas implementada en Tensorflow 2 usando SQUAD | [Muhammad Harris](https://github.com/HarrisDePerceptron) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb) | +| [Entrenar T5 en TPU](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb) | Cómo entrenar a T5 en SQUAD con Transformers y Nlp | [Suraj Patil](https://github.com/patil-suraj) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) | +| [Ajustar T5 para Clasificación y Opción Múltiple](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | Cómo ajustar T5 para clasificación y tareas de opción múltiple usando un formato de texto a texto con PyTorch Lightning | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | +| [Ajustar DialoGPT en nuevos conjuntos de datos e idiomas](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | Cómo ajustar el modelo DialoGPT en un nuevo conjunto de datos para chatbots conversacionales de diálogo abierto | [Nathan Cooper](https://github.com/ncoop57) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | +| [Modelado de secuencias largas con Reformer](https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | Cómo entrenar en secuencias de hasta 500,000 tokens con Reformer | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | +| [Ajustar BART para resumir](https://github.com/ohmeow/ohmeow_website/blob/master/_notebooks/2020-05-23-text-generation-with-blurr.ipynb) | Cómo ajustar BART para resumir con fastai usando blurr | [Wayde Gilliam](https://ohmeow.com/) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/_notebooks/2020-05-23-text-generation-with-blurr.ipynb) | +| [Ajustar un Transformador previamente entrenado en los tweets de cualquier persona](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | Cómo generar tweets al estilo de tu cuenta de Twitter favorita ajustando un modelo GPT-2 | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | +| [Optimizar 🤗 modelos de Hugging Face con pesos y sesgos](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | Un tutorial completo que muestra la integración de W&B con Hugging Face | [Boris Dayma](https://github.com/borisdayma) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | +| [Preentrenar Longformer](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | Cómo construir una versión "larga" de modelos preentrenados existentes | [Iz Beltagy](https://beltagy.net) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | +| [Ajustar Longformer para control de calidad](https://github.com/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | Cómo ajustar el modelo antiguo para la tarea de control de calidad | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | +| [Evaluar modelo con 🤗nlp](https://github.com/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb) | Cómo evaluar longformer en TriviaQA con `nlp` | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1m7eTGlPmLRgoPkkA7rkhQdZ9ydpmsdLE?usp=sharing) | +| [Ajustar fino de T5 para la extracción de amplitud de opinión](https://github.com/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | Cómo ajustar T5 para la extracción de intervalos de opiniones mediante un formato de texto a texto con PyTorch Lightning | [Lorenzo Ampil](https://github.com/enzoampil) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | +| [Ajustar fino de DistilBert para la clasificación multiclase](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb) | Cómo ajustar DistilBert para la clasificación multiclase con PyTorch | [Abhishek Kumar Mishra](https://github.com/abhimishra91) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb)| +|[Ajustar BERT para la clasificación de etiquetas múltiples](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)| Cómo ajustar BERT para la clasificación de múltiples etiquetas usando PyTorch |[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)| +|[Ajustar T5 para resumir](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)| Cómo ajustar T5 para resumir en PyTorch y realizar un seguimiento de los experimentos con WandB |[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)| +|[Acelerar el ajuste fino en transformadores con Dynamic Padding/Bucketing](https://github.com/ELS-RD/transformers-notebook/blob/master/Divide_Hugging_Face_Transformers_training_time_by_2_or_more.ipynb)| Cómo acelerar el ajuste fino en un factor de 2 usando relleno dinámico/cubetas |[Michael Benesty](https://github.com/pommedeterresautee) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CBfRU1zbfu7-ijiOqAAQUA-RJaxfcJoO?usp=sharing)| +|[Preentrenar Reformer para modelado de lenguaje enmascarado](https://github.com/patrickvonplaten/notebooks/blob/master/Reformer_For_Masked_LM.ipynb)| Cómo entrenar un modelo Reformer con capas de autoatención bidireccionales | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tzzh0i8PgDQGV3SMFUGxM7_gGae3K-uW?usp=sharing)| +|[Ampliar y ajustar Sci-BERT](https://github.com/lordtt13/word-embeddings/blob/master/COVID-19%20Research%20Data/COVID-SciBERT.ipynb)| Cómo aumentar el vocabulario de un modelo SciBERT preentrenado de AllenAI en el conjunto de datos CORD y canalizarlo. | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rqAR40goxbAfez1xvF3hBJphSCsvXmh8)| +|[Ajustar fino de BlenderBotSmall para resúmenes usando la API de Entrenador](https://github.com/lordtt13/transformers-experiments/blob/master/Custom%20Tasks/fine-tune-blenderbot_small-for-summarization.ipynb)| Cómo ajustar BlenderBotSmall para resumir en un conjunto de datos personalizado, utilizando la API de Entrenador. 
| [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/19Wmupuls7mykSGyRN_Qo6lPQhgp56ymq?usp=sharing)| +|[Ajustar Electra e interpreta con gradientes integrados](https://github.com/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb) | Cómo ajustar Electra para el análisis de sentimientos e interpretar predicciones con Captum Integrated Gradients | [Eliza Szczechla](https://elsanns.github.io) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb)| +|[ajustar un modelo GPT-2 que no está en inglés con la clase Trainer](https://github.com/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb) | Cómo ajustar un modelo GPT-2 que no está en inglés con la clase Trainer | [Philipp Schmid](https://www.philschmid.de) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb)| +|[Ajustar un modelo DistilBERT para la tarea de clasificación de múltiples etiquetas](https://github.com/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb) | Cómo ajustar un modelo DistilBERT para la tarea de clasificación de múltiples etiquetas | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)| +|[Ajustar ALBERT para la clasificación de pares de oraciones](https://github.com/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb) | Cómo ajustar un modelo ALBERT u otro modelo basado en BERT para la tarea de clasificación de pares de oraciones | [Nadir El Manouzi](https://github.com/NadirEM) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb)| +|[Ajustar a Roberta para el análisis de sentimientos](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | Cómo ajustar un modelo de Roberta para el análisis de sentimientos | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)| +|[Evaluación de modelos de generación de preguntas](https://github.com/flexudy-pipe/qugeev) | ¿Qué tan precisas son las respuestas a las preguntas generadas por tu modelo de transformador seq2seq? 
| [Pascal Zoleko](https://github.com/zolekode) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)| +|[Clasificar texto con DistilBERT y Tensorflow](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | Cómo ajustar DistilBERT para la clasificación de texto en TensorFlow | [Peter Bayerle](https://github.com/peterbayerle) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)| +|[Aprovechar BERT para el resumen de codificador y decodificador en CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | Cómo iniciar en caliente un *EncoderDecoderModel* con un punto de control *bert-base-uncased* para resumir en CNN/Dailymail | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)| +|[Aprovechar RoBERTa para el resumen de codificador-decodificador en BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | Cómo iniciar en caliente un *EncoderDecoderModel* compartido con un punto de control *roberta-base* para resumir en BBC/XSum | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)| +|[Ajustar TAPAS en Sequential Question Answering (SQA)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | Cómo ajustar *TapasForQuestionAnswering* con un punto de control *tapas-base* en el conjunto de datos del Sequential Question Answering (SQA) | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)| +|[Evaluar TAPAS en Table Fact Checking (TabFact)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | Cómo evaluar un *TapasForSequenceClassification* ajustado con un punto de control *tapas-base-finetuned-tabfact* usando una combinación de 🤗 conjuntos de datos y 🤗 bibliotecas de transformadores | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)| +|[Ajustar de mBART para traducción](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | Cómo ajustar mBART utilizando Seq2SeqTrainer para la traducción del hindi al inglés | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)| +|[Ajustar LayoutLM en FUNSD (a form understanding 
dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb) | Cómo ajustar *LayoutLMForTokenClassification* en el conjunto de datos de FUNSD para la extracción de información de documentos escaneados | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb)| +|[Ajustar DistilGPT2 y genere texto](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb) | Cómo ajustar DistilGPT2 y generar texto | [Aakash Tripathi](https://github.com/tripathiaakash) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb)| +|[Ajustar LED en tokens de hasta 8K](https://github.com/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb) | Cómo ajustar LED en pubmed para resúmenes de largo alcance | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb)| +|[Evaluar LED en Arxiv](https://github.com/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb) | Cómo evaluar efectivamente LED en resúmenes de largo alcance | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb)| +|[Ajustar fino de LayoutLM en RVL-CDIP (un conjunto de datos de clasificación de imágenes de documentos)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb) | Cómo ajustar *LayoutLMForSequenceClassification* en el conjunto de datos RVL-CDIP para la clasificación de documentos escaneados | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb)| +|[Decodificación Wav2Vec2 CTC con ajuste GPT2](https://github.com/voidful/huggingface_notebook/blob/main/xlsr_gpt.ipynb) | Cómo decodificar la secuencia CTC con el ajuste del modelo de lenguaje | [Eric Lam](https://github.com/voidful) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1e_z5jQHYbO2YKEaUgzb1ww1WwiAyydAj?usp=sharing)| +|[Ajustar BART para resúmenes en dos idiomas con la clase Trainer](https://github.com/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb) | Cómo ajustar BART para resúmenes en dos idiomas con la clase Trainer | [Eliza Szczechla](https://github.com/elsanns) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)| +|[Evaluar Big Bird en Trivia 
QA](https://github.com/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb) | Cómo evaluar BigBird en respuesta a preguntas de documentos largos en Trivia QA | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb)| +| [Crear subtítulos de video usando Wav2Vec2](https://github.com/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | Cómo crear subtítulos de YouTube a partir de cualquier vídeo transcribiendo el audio con Wav2Vec | [Niklas Muennighoff](https://github.com/Muennighoff) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | +| [Ajustar el transformador de visión en CIFAR-10 usando PyTorch Lightning](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | Cómo ajustar el transformador de visión (ViT) en CIFAR-10 usando transformadores HuggingFace, conjuntos de datos y PyTorch Lightning | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | +| [Ajustar el Transformador de visión en CIFAR-10 usando el 🤗 Entrenador](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | Cómo ajustar el Vision Transformer (ViT) en CIFAR-10 usando HuggingFace Transformers, Datasets y el 🤗 Trainer | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | +| [Evaluar LUKE en Open Entity, un conjunto de datos de tipificación de entidades](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | Cómo evaluar *LukeForEntityClassification* en el conjunto de datos de entidad abierta | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | +| [Evaluar LUKE en TACRED, un conjunto de datos de extracción de relaciones](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | Cómo evaluar *LukeForEntityPairClassification* en el conjunto de datos TACRED | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | +| [Evaluar LUKE en CoNLL-2003, un punto de referencia importante de NER](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | Cómo evaluar *LukeForEntitySpanClassification* en el conjunto de datos CoNLL-2003 | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | +| [Evaluar BigBird-Pegasus en el conjunto de datos de PubMed](https://github.com/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | Cómo evaluar *BigBirdPegasusForConditionalGeneration* en el conjunto de datos de PubMed | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | +| [Clasificación de emociones del habla con Wav2Vec2](https://github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | Cómo aprovechar un modelo Wav2Vec2 preentrenado para la clasificación de emociones en el conjunto de datos MEGA | [Mehrdad Farahani](https://github.com/m3hrdadfi) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | +| [Detectar objetos en una imagen con DETR](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | Cómo usar un modelo entrenado *DetrForObjectDetection* para detectar objetos en una imagen y visualizar la atención | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | +| [Ajustar el DETR en un conjunto de datos de detección de objetos personalizados](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | Cómo ajustar *DetrForObjectDetection* en un conjunto de datos de detección de objetos personalizados | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | +| [Ajustar T5 para el reconocimiento de entidades nombradas](https://github.com/ToluClassics/Notebooks/blob/main/T5_Ner_Finetuning.ipynb) | Cómo ajustar *T5* en una tarea de reconocimiento de entidad nombrada | [Ogundepo Odunayo](https://github.com/ToluClassics) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing) | diff --git a/docs/source/es/converting_tensorflow_models.md b/docs/source/es/converting_tensorflow_models.md new file mode 100644 index 000000000000..c7e22bddac70 --- /dev/null +++ b/docs/source/es/converting_tensorflow_models.md @@ -0,0 +1,153 @@ + + +# Convertir checkpoints de Tensorflow + +Te proporcionamos una interfaz de línea de comando (`CLI`, por sus siglas en inglés) para convertir puntos de control (_checkpoints_) originales de Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM en modelos que se puedan cargar utilizando los métodos `from_pretrained` de la biblioteca. 
+ + + +Desde 2.3.0, el script para convertir es parte de la CLI de transformers (**transformers-cli**) disponible en cualquier instalación de transformers >= 2.3.0. + +La siguiente documentación refleja el formato para el comando **transformers-cli convert**. + + + +## BERT + +Puedes convertir cualquier checkpoint de TensorFlow para BERT (en particular, [los modelos pre-entrenados y publicados por Google](https://github.com/google-research/bert#pre-trained-models)) en un archivo de PyTorch mediante el script [convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py). + +Esta CLI toma como entrada un checkpoint de TensorFlow (tres archivos que comienzan con `bert_model.ckpt`) y el archivo de configuración asociado (`bert_config.json`), y crea un modelo PyTorch para esta configuración, carga los pesos del checkpoint de TensorFlow en el modelo de PyTorch y guarda el modelo resultante en un archivo estándar de PyTorch que se puede importar usando `from_pretrained()` (ve el ejemplo en [Tour rápido](quicktour), [run_glue.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_glue.py)). + +Solo necesitas ejecutar este script **una vez** para convertir un modelo a PyTorch. Después, puedes ignorar el checkpoint de TensorFlow (los tres archivos que comienzan con `bert_model.ckpt`), pero asegúrate de conservar el archivo de configuración (`bert_config.json`) y el archivo de vocabulario (`vocab.txt`) ya que estos también son necesarios para el modelo en PyTorch. + +Para ejecutar este script deberás tener instalado TensorFlow y PyTorch (`pip install tensorflow`). El resto del repositorio solo requiere PyTorch. + +Aquí hay un ejemplo del proceso para convertir un modelo `BERT-Base Uncased` pre-entrenado: + +```bash +export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12 + +transformers-cli convert --model_type bert \ + --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \ + --config $BERT_BASE_DIR/bert_config.json \ + --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin +``` + +Puedes descargar los modelos pre-entrenados de Google para la conversión [aquí](https://github.com/google-research/bert#pre-trained-models). + +## ALBERT + +Convierte los checkpoints del modelo ALBERT de TensorFlow a PyTorch usando el script [convert_albert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py). + +La CLI toma como entrada un checkpoint de TensorFlow (tres archivos que comienzan con `model.ckpt-best`) y el archivo de configuración adjunto (`albert_config.json`), luego crea y guarda un modelo de PyTorch. Para ejecutar esta conversión deberás tener instalados TensorFlow y PyTorch. + +Aquí hay un ejemplo del proceso para convertir un modelo `ALBERT Base` pre-entrenado: + +```bash +export ALBERT_BASE_DIR=/path/to/albert/albert_base + +transformers-cli convert --model_type albert \ + --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \ + --config $ALBERT_BASE_DIR/albert_config.json \ + --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin +``` + +Puedes descargar los modelos pre-entrenados de Google para la conversión [aquí](https://github.com/google-research/albert#pre-trained-models). 
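
Una vez convertido el checkpoint, el modelo resultante se carga con `from_pretrained()`. El siguiente boceto es solo ilustrativo y asume que la carpeta contiene `pytorch_model.bin`, el archivo de vocabulario y que la configuración se guardó (o renombró) como `config.json`:

```py
from transformers import BertModel, BertTokenizer

# Rutas ilustrativas: la misma carpeta usada arriba como $BERT_BASE_DIR
model = BertModel.from_pretrained("/path/to/bert/uncased_L-12_H-768_A-12")
tokenizer = BertTokenizer.from_pretrained("/path/to/bert/uncased_L-12_H-768_A-12")
```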
+ +## OpenAI GPT + +Este es un ejemplo del proceso para convertir un modelo OpenAI GPT pre-entrenado, asumiendo que tu checkpoint de NumPy se guarda con el mismo formato que el modelo pre-entrenado de OpenAI (más información [aquí](https://github.com/openai/finetune-transformer-lm)): + +```bash +export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights + +transformers-cli convert --model_type gpt \ + --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \ + --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ + [--config OPENAI_GPT_CONFIG] \ + [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \ +``` + +## OpenAI GPT-2 + +Aquí hay un ejemplo del proceso para convertir un modelo OpenAI GPT-2 pre-entrenado (más información [aquí](https://github.com/openai/gpt-2)): + +```bash +export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights + +transformers-cli convert --model_type gpt2 \ + --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \ + --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ + [--config OPENAI_GPT2_CONFIG] \ + [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK] +``` + +## Transformer-XL + +Aquí hay un ejemplo del proceso para convertir un modelo Transformer-XL pre-entrenado (más información [aquí](https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models)): + +```bash +export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint + +transformers-cli convert --model_type transfo_xl \ + --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \ + --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ + [--config TRANSFO_XL_CONFIG] \ + [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK] +``` + +## XLNet + +Aquí hay un ejemplo del proceso para convertir un modelo XLNet pre-entrenado: + +```bash +export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint +export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config + +transformers-cli convert --model_type xlnet \ + --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \ + --config $TRANSFO_XL_CONFIG_PATH \ + --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ + [--finetuning_task_name XLNET_FINETUNED_TASK] \ +``` + +## XLM + +Aquí hay un ejemplo del proceso para convertir un modelo XLM pre-entrenado: + +```bash +export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint + +transformers-cli convert --model_type xlm \ + --tf_checkpoint $XLM_CHECKPOINT_PATH \ + --pytorch_dump_output $PYTORCH_DUMP_OUTPUT + [--config XML_CONFIG] \ + [--finetuning_task_name XML_FINETUNED_TASK] +``` + +## T5 + +Aquí hay un ejemplo del proceso para convertir un modelo T5 pre-entrenado: + +```bash +export T5=/path/to/t5/uncased_L-12_H-768_A-12 + +transformers-cli convert --model_type t5 \ + --tf_checkpoint $T5/t5_model.ckpt \ + --config $T5/t5_config.json \ + --pytorch_dump_output $T5/pytorch_model.bin +``` diff --git a/docs/source/es/converting_tensorflow_models.mdx b/docs/source/es/converting_tensorflow_models.mdx deleted file mode 100644 index 2ab15e81b250..000000000000 --- a/docs/source/es/converting_tensorflow_models.mdx +++ /dev/null @@ -1,149 +0,0 @@ - - -# Convertir checkpoints de Tensorflow - -Te proporcionamos una interfaz de línea de comando (`CLI`, por sus siglas en inglés) para convertir puntos de control (_checkpoints_) originales de Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM en modelos que se puedan cargar utilizando los métodos `from_pretrained` de la biblioteca. - - - -Desde 2.3.0, el script para convertir es parte de la CLI de transformers (**transformers-cli**) disponible en cualquier instalación de transformers >= 2.3.0. 
- -La siguiente documentación refleja el formato para el comando **transformers-cli convert**. - - - -## BERT - -Puedes convertir cualquier checkpoint de TensorFlow para BERT (en particular, [los modelos pre-entrenados y publicados por Google](https://github.com/google-research/bert#pre-trained-models)) en un archivo de PyTorch mediante el script [convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py). - -Esta CLI toma como entrada un checkpoint de TensorFlow (tres archivos que comienzan con `bert_model.ckpt`) y el archivo de configuración asociado (`bert_config.json`), y crea un modelo PyTorch para esta configuración, carga los pesos del checkpoint de TensorFlow en el modelo de PyTorch y guarda el modelo resultante en un archivo estándar de PyTorch que se puede importar usando `from_pretrained()` (ve el ejemplo en [Tour rápido](quicktour), [run_glue.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_glue.py)). - -Solo necesitas ejecutar este script **una vez** para convertir un modelo a PyTorch. Después, puedes ignorar el checkpoint de TensorFlow (los tres archivos que comienzan con `bert_model.ckpt`), pero asegúrate de conservar el archivo de configuración (`bert_config.json`) y el archivo de vocabulario (`vocab.txt`) ya que estos también son necesarios para el modelo en PyTorch. - -Para ejecutar este script deberás tener instalado TensorFlow y PyTorch (`pip install tensorflow`). El resto del repositorio solo requiere PyTorch. - -Aquí hay un ejemplo del proceso para convertir un modelo `BERT-Base Uncased` pre-entrenado: - -```bash -export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12 - -transformers-cli convert --model_type bert \ - --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \ - --config $BERT_BASE_DIR/bert_config.json \ - --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin -``` - -Puedes descargar los modelos pre-entrenados de Google para la conversión [aquí](https://github.com/google-research/bert#pre-trained-models). - -## ALBERT - -Convierte los checkpoints del modelo ALBERT de TensorFlow a PyTorch usando el script [convert_albert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py). - -La CLI toma como entrada un checkpoint de TensorFlow (tres archivos que comienzan con `model.ckpt-best`) y el archivo de configuración adjunto (`albert_config.json`), luego crea y guarda un modelo de PyTorch. Para ejecutar esta conversión deberás tener instalados TensorFlow y PyTorch. - -Aquí hay un ejemplo del proceso para convertir un modelo `ALBERT Base` pre-entrenado: - -```bash -export ALBERT_BASE_DIR=/path/to/albert/albert_base - -transformers-cli convert --model_type albert \ - --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \ - --config $ALBERT_BASE_DIR/albert_config.json \ - --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin -``` - -Puedes descargar los modelos pre-entrenados de Google para la conversión [aquí](https://github.com/google-research/albert#pre-trained-models). 
- -## OpenAI GPT - -Este es un ejemplo del proceso para convertir un modelo OpenAI GPT pre-entrenado, asumiendo que tu checkpoint de NumPy se guarda con el mismo formato que el modelo pre-entrenado de OpenAI (más información [aquí](https://github.com/openai/finetune-transformer-lm)): - -```bash -export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights - -transformers-cli convert --model_type gpt \ - --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--config OPENAI_GPT_CONFIG] \ - [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \ -``` - -## OpenAI GPT-2 - -Aquí hay un ejemplo del proceso para convertir un modelo OpenAI GPT-2 pre-entrenado (más información [aquí](https://github.com/openai/gpt-2)): - -```bash -export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights - -transformers-cli convert --model_type gpt2 \ - --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--config OPENAI_GPT2_CONFIG] \ - [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK] -``` - -## Transformer-XL - -Aquí hay un ejemplo del proceso para convertir un modelo Transformer-XL pre-entrenado (más información [aquí](https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models)): - -```bash -export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint - -transformers-cli convert --model_type transfo_xl \ - --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--config TRANSFO_XL_CONFIG] \ - [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK] -``` - -## XLNet - -Aquí hay un ejemplo del proceso para convertir un modelo XLNet pre-entrenado: - -```bash -export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint -export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config - -transformers-cli convert --model_type xlnet \ - --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \ - --config $TRANSFO_XL_CONFIG_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--finetuning_task_name XLNET_FINETUNED_TASK] \ -``` - -## XLM - -Aquí hay un ejemplo del proceso para convertir un modelo XLM pre-entrenado: - -```bash -export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint - -transformers-cli convert --model_type xlm \ - --tf_checkpoint $XLM_CHECKPOINT_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT - [--config XML_CONFIG] \ - [--finetuning_task_name XML_FINETUNED_TASK] -``` - -## T5 - -Aquí hay un ejemplo del proceso para convertir un modelo T5 pre-entrenado: - -```bash -export T5=/path/to/t5/uncased_L-12_H-768_A-12 - -transformers-cli convert --model_type t5 \ - --tf_checkpoint $T5/t5_model.ckpt \ - --config $T5/t5_config.json \ - --pytorch_dump_output $T5/pytorch_model.bin -``` diff --git a/docs/source/es/create_a_model.md b/docs/source/es/create_a_model.md new file mode 100644 index 000000000000..04014a7b6a70 --- /dev/null +++ b/docs/source/es/create_a_model.md @@ -0,0 +1,371 @@ + + +# Crea una arquitectura personalizada + +Una [`AutoClass`](model_doc/auto) infiere, automáticamente, la arquitectura del modelo y descarga la configuración y los pesos del modelo preentrenado. Normalmente, recomendamos usar una `AutoClass` para producir un código agnóstico a puntos de guardado o checkpoints. Sin embargo, los usuarios que quieran más control sobre los parámetros específicos de los modelos pueden crear su propio modelo 🤗 Transformers personalizado a partir de varias clases base. 
Esto puede ser particularmente útil para alguien que esté interesado en estudiar, entrenar o experimentar con modelos 🤗 Transformers. En esta guía vamos a profundizar en la creación de modelos personalizados sin usar `AutoClass`. Aprenderemos a: + +- Cargar y personalizar una configuración para un modelo. +- Crear una arquitectura para un modelo. +- Crear tokenizadores rápidos y lentos para textos. +- Crear un extractor de propiedades para tareas de audio o imágenes. +- Crear un procesador para tareas multimodales. + +## Configuración + +Una [configuración](main_classes/configuration) es un conjunto de atributos específicos de un modelo. Cada configuración de modelo tiene atributos diferentes. Por ejemplo, todos los modelos de PLN tienen los atributos `hidden_size`, `num_attention_heads`, `num_hidden_layers` y `vocab_size` en común. Estos atributos especifican el número de cabezas de atención o de capas ocultas con las que se construyen los modelos. + +Puedes echarle un vistazo a [DistilBERT](model_doc/distilbert) y sus atributos accediendo a [`DistilBertConfig`]: + +```py +>>> from transformers import DistilBertConfig + +>>> config = DistilBertConfig() +>>> print(config) +DistilBertConfig { + "activation": "gelu", + "attention_dropout": 0.1, + "dim": 768, + "dropout": 0.1, + "hidden_dim": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "model_type": "distilbert", + "n_heads": 12, + "n_layers": 6, + "pad_token_id": 0, + "qa_dropout": 0.1, + "seq_classif_dropout": 0.2, + "sinusoidal_pos_embds": false, + "transformers_version": "4.16.2", + "vocab_size": 30522 +} +``` + +[`DistilBertConfig`] muestra todos los atributos por defecto que se han usado para construir un modelo [`DistilBertModel`] base. Todos ellos son personalizables, lo que deja espacio para poder experimentar. Por ejemplo, puedes personalizar un modelo predeterminado para: + +- Probar una función de activación diferente, usando el parámetro `activation`. +- Usar un valor de abandono (también conocido como _dropout_) más alto para las probabilidades de las capas de atención, usando el parámetro `attention_dropout`. + +```py +>>> my_config = DistilBertConfig(activation="relu", attention_dropout=0.4) +>>> print(my_config) +DistilBertConfig { + "activation": "relu", + "attention_dropout": 0.4, + "dim": 768, + "dropout": 0.1, + "hidden_dim": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "model_type": "distilbert", + "n_heads": 12, + "n_layers": 6, + "pad_token_id": 0, + "qa_dropout": 0.1, + "seq_classif_dropout": 0.2, + "sinusoidal_pos_embds": false, + "transformers_version": "4.16.2", + "vocab_size": 30522 +} +``` + +Los atributos de los modelos preentrenados pueden ser modificados con la función [`~PretrainedConfig.from_pretrained`]: + +```py +>>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4) +``` + +Cuando estés satisfecho con la configuración de tu modelo, puedes guardarlo con la función [`~PretrainedConfig.save_pretrained`]. Tu configuración se guardará en un archivo JSON dentro del directorio que le especifiques como parámetro. 
+ +```py +>>> my_config.save_pretrained(save_directory="./your_model_save_path") +``` + +Para volver a usar el archivo de configuración, puedes cargarlo usando [`~PretrainedConfig.from_pretrained`]: + +```py +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") +``` + + + +También puedes guardar los archivos de configuración como un diccionario; o incluso guardar solo la diferencia entre tu archivo personalizado y la configuración por defecto. Consulta la [documentación sobre configuración](main_classes/configuration) para ver más detalles. + + + +## Modelo + +El siguiente paso será crear un [modelo](main_classes/models). El modelo, al que a veces también nos referimos como arquitectura, es el encargado de definir cada capa y qué operaciones se realizan. Los atributos como `num_hidden_layers` de la configuración se usan para definir la arquitectura. Todos los modelos comparten una clase base, [`PreTrainedModel`], y algunos métodos comunes que se pueden usar para redimensionar los _embeddings_ o para recortar cabezas de auto-atención (también llamadas _self-attention heads_). Además, todos los modelos son subclases de [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) o [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/flax.linen.html#module), lo que significa que son compatibles con su respectivo framework. + + + + +Carga los atributos de tu configuración personalizada en el modelo de la siguiente forma: + +```py +>>> from transformers import DistilBertModel + +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") +>>> model = DistilBertModel(my_config) +``` + +Esto crea un modelo con valores aleatorios, en lugar de crearlo con los pesos del preentrenamiento, por lo que no serás capaz de usar este modelo para nada útil hasta que no lo entrenes. El entrenamiento es un proceso costoso, tanto en cuestión de recursos como de tiempo, por lo que generalmente es mejor usar un modelo preentrenado para obtener mejores resultados más rápido, consumiendo una fracción de los recursos que un entrenamiento completo hubiera requerido. + +Puedes crear un modelo preentrenado con [`~PreTrainedModel.from_pretrained`]: + +```py +>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased") +``` + +Cuando cargues tus pesos del preentrenamiento, el modelo por defecto se carga automáticamente si nos lo proporciona 🤗 Transformers. Sin embargo, siempre puedes reemplazar (todos o algunos de) los atributos del modelo por defecto por los tuyos: + +```py +>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) +``` + + + +Carga los atributos de tu configuración personalizada en el modelo de la siguiente forma: + +```py +>>> from transformers import TFDistilBertModel + +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") +>>> tf_model = TFDistilBertModel(my_config) +``` + +Esto crea un modelo con valores aleatorios, en lugar de crearlo con los pesos del preentrenamiento, por lo que no serás capaz de usar este modelo para nada útil hasta que no lo entrenes. El entrenamiento es un proceso costoso, tanto en cuestión de recursos como de tiempo, por lo que generalmente es mejor usar un modelo preentrenado para obtener mejores resultados más rápido, consumiendo solo una fracción de los recursos que un entrenamiento completo hubiera requerido. 
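Por ejemplo, un pequeño bosquejo (con ids de tokens arbitrarios, solo a modo ilustrativo) para comprobar que un modelo recién inicializado ya acepta entradas, aunque sus pesos sean aleatorios:

```py
import tensorflow as tf
from transformers import DistilBertConfig, TFDistilBertModel

my_config = DistilBertConfig(activation="relu", attention_dropout=0.4)
tf_model = TFDistilBertModel(my_config)

# ids de tokens arbitrarios: la salida tendrá la forma (batch, longitud, dim)
dummy_ids = tf.constant([[101, 2023, 2003, 1037, 7953, 102]])
outputs = tf_model(dummy_ids)
print(outputs.last_hidden_state.shape)  # (1, 6, 768)
```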
+ +Puedes crear un modelo preentrenado con [`~TFPreTrainedModel.from_pretrained`]: + +```py +>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased") +``` + +Cuando cargues tus pesos del preentrenamiento, el modelo por defecto se carga automáticamente si este nos lo proporciona 🤗 Transformers. Sin embargo, siempre puedes reemplazar (todos o algunos de) los atributos del modelo por defecto por los tuyos: + +```py +>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) +``` + + + +### Cabezas de modelo + +En este punto del tutorial, tenemos un modelo DistilBERT base que devuelve los *hidden states* o estados ocultos. Los *hidden states* se pasan como parámetros de entrada a la cabeza del modelo para producir la salida. 🤗 Transformers ofrece una cabeza de modelo diferente para cada tarea, siempre y cuando el modelo sea compatible para la tarea (por ejemplo, no puedes usar DistilBERT para una tarea secuencia a secuencia como la traducción). + + + + + +Por ejemplo, [`DistilBertForSequenceClassification`] es un modelo DistilBERT base con una cabeza de clasificación de secuencias. La cabeza de clasificación de secuencias es una capa superior que precede a la recolección de las salidas. + +```py +>>> from transformers import DistilBertForSequenceClassification + +>>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Puedes reutilizar este punto de guardado o *checkpoint* para otra tarea fácilmente cambiando a una cabeza de un modelo diferente. Para una tarea de respuesta a preguntas, puedes usar la cabeza del modelo [`DistilBertForQuestionAnswering`]. La cabeza de respuesta a preguntas es similar a la de clasificación de secuencias, excepto porque consta de una capa lineal delante de la salida de los *hidden states*. + + +```py +>>> from transformers import DistilBertForQuestionAnswering + +>>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") +``` + + + +Por ejemplo, [`TFDistilBertForSequenceClassification`] es un modelo DistilBERT base con una cabeza de clasificación de secuencias. La cabeza de clasificación de secuencias es una capa superior que precede a la recolección de las salidas. + +```py +>>> from transformers import TFDistilBertForSequenceClassification + +>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Puedes reutilizar este punto de guardado o *checkpoint* para otra tarea fácilmente cambiando a una cabeza de un modelo diferente. Para una tarea de respuesta a preguntas, puedes usar la cabeza del modelo [`TFDistilBertForQuestionAnswering`]. La cabeza de respuesta a preguntas es similar a la de clasificación de secuencias, excepto porque consta de una capa lineal delante de la salida de los *hidden states*. + + +```py +>>> from transformers import TFDistilBertForQuestionAnswering + +>>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") +``` + + + +## Tokenizer + +La ultima clase base que debes conocer antes de usar un modelo con datos textuales es la clase [tokenizer](main_classes/tokenizer), que convierte el texto bruto en tensores. Hay dos tipos de *tokenizers* que puedes usar con 🤗 Transformers: + +- [`PreTrainedTokenizer`]: una implementación de un *tokenizer* hecha en Python. +- [`PreTrainedTokenizerFast`]: un *tokenizer* de nuestra librería [🤗 Tokenizer](https://huggingface.co/docs/tokenizers/python/latest/), basada en Rust. 
Este tipo de *tokenizer* es bastante más rápido, especialmente durante la tokenización por lotes, gracias a estar implementado en Rust. Esta rápida tokenización también ofrece métodos adicionales como el *offset mapping*, que relaciona los tokens con sus palabras o caracteres originales. + +Ambos *tokenizers* son compatibles con los métodos comunes, como los de encodificación y decodificación, los métodos para añadir tokens y aquellos que manejan tokens especiales. + + + +No todos los modelos son compatibles con un *tokenizer* rápido. Échale un vistazo a esta [tabla](index#supported-frameworks) para comprobar si un modelo específico es compatible con un *tokenizer* rápido. + + + +Si has entrenado tu propio *tokenizer*, puedes crear uno desde tu archivo de “vocabulario”: + +```py +>>> from transformers import DistilBertTokenizer + +>>> my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left") +``` + +Es importante recordar que los vocabularios que provienen de un *tokenizer* personalizado serán diferentes a los vocabularios generados por el *tokenizer* de un modelo preentrenado. Debes usar el vocabulario de un *tokenizer* preentrenado si vas a usar un modelo preentrenado, de lo contrario las entradas no tendrán sentido. Crea un *tokenizer* con el vocabulario de un modelo preentrenado usando la clase [`DistilBertTokenizer`]: + + +```py +>>> from transformers import DistilBertTokenizer + +>>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") +``` + +Crea un *tokenizer* rápido con la clase [`DistilBertTokenizerFast`]: + + +```py +>>> from transformers import DistilBertTokenizerFast + +>>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased") +``` + + + +Por defecto, el [`AutoTokenizer`] intentará cargar un *tokenizer* rápido. Puedes desactivar este comportamiento cambiando el parámetro `use_fast=False` de `from_pretrained`. + + + + +## Extractor de Características + +Un extractor de características procesa entradas de audio e imagen. Hereda de la clase base [`~feature_extraction_utils.FeatureExtractionMixin`] y también puede heredar de la clase [`ImageFeatureExtractionMixin`] para el procesamiento de características de las imágenes o de la clase [`SequenceFeatureExtractor`] para el procesamiento de entradas de audio. + +Dependiendo de si trabajas en una tarea de audio o de video, puedes crear un extractor de características asociado al modelo que estés usando. Por ejemplo, podrías crear un [`ViTFeatureExtractor`] por defecto si estás usando [ViT](model_doc/vit) para clasificación de imágenes: + +```py +>>> from transformers import ViTFeatureExtractor + +>>> vit_extractor = ViTFeatureExtractor() +>>> print(vit_extractor) +ViTFeatureExtractor { + "do_normalize": true, + "do_resize": true, + "feature_extractor_type": "ViTFeatureExtractor", + "image_mean": [ + 0.5, + 0.5, + 0.5 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": 2, + "size": 224 +} +``` + + + +Si no estás buscando ninguna personalización en específico, usa el método `from_pretrained` para cargar los parámetros del extractor de características por defecto del modelo. 
+ + + +Puedes modificar cualquier parámetro de [`ViTFeatureExtractor`] para crear tu extractor de características personalizado: + +```py +>>> from transformers import ViTFeatureExtractor + +>>> my_vit_extractor = ViTFeatureExtractor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3]) +>>> print(my_vit_extractor) +ViTFeatureExtractor { + "do_normalize": false, + "do_resize": true, + "feature_extractor_type": "ViTFeatureExtractor", + "image_mean": [ + 0.3, + 0.3, + 0.3 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": "PIL.Image.BOX", + "size": 224 +} +``` + +Para las entradas de audio, puedes crear un [`Wav2Vec2FeatureExtractor`] y personalizar los parámetros de una forma similar: + + +```py +>>> from transformers import Wav2Vec2FeatureExtractor + +>>> w2v2_extractor = Wav2Vec2FeatureExtractor() +>>> print(w2v2_extractor) +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 16000 +} +``` + +## Procesador + +Para modelos que son compatibles con tareas multimodales, 🤗 Transformers ofrece una clase *procesador* que agrupa un extractor de características y un *tokenizer* en el mismo objeto. Por ejemplo, probemos a usar el procesador [`Wav2Vec2Processor`] para una tarea de reconocimiento de voz (ASR). Un ASR transcribe el audio a texto, por lo que necesitaremos un extractor de características y un *tokenizer*. + +Crea un extractor de características para manejar la entrada de audio: + + +```py +>>> from transformers import Wav2Vec2FeatureExtractor + +>>> feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True) +``` + +Crea un *tokenizer* para manejar la entrada de texto: + +```py +>>> from transformers import Wav2Vec2CTCTokenizer + +>>> tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt") +``` + +Puedes combinar el extractor de características y el *tokenizer* en el [`Wav2Vec2Processor`]: + + +```py +>>> from transformers import Wav2Vec2Processor + +>>> processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) +``` +Con dos clases base (la configuración y el modelo) y una clase de preprocesamiento adicional (*tokenizer*, extractor de características o procesador), puedes crear cualquiera de los modelos compatibles con 🤗 Transformers. Cada una de estas clases son configurables, permitiéndote usar sus atributos específicos. Puedes crear un modelo para entrenarlo de una forma fácil, o modificar un modelo preentrenado disponible para especializarlo. diff --git a/docs/source/es/create_a_model.mdx b/docs/source/es/create_a_model.mdx deleted file mode 100644 index 99ded53ee653..000000000000 --- a/docs/source/es/create_a_model.mdx +++ /dev/null @@ -1,367 +0,0 @@ - - -# Crea una arquitectura personalizada - -Una [`AutoClass`](model_doc/auto) infiere, automáticamente, la arquitectura del modelo y descarga la configuración y los pesos del modelo preentrenado. Normalmente, recomendamos usar una `AutoClass` para producir un código agnóstico a puntos de guardado o checkpoints. Sin embargo, los usuarios que quieran más control sobre los parámetros específicos de los modelos pueden crear su propio modelo 🤗 Transformers personalizado a partir de varias clases base. Esto puede ser particularmente útil para alguien que esté interesado en estudiar, entrenar o experimentar con modelos 🤗 Transformers. 
En esta guía vamos a profundizar en la creación de modelos personalizados sin usar `AutoClass`. Aprenderemos a: - -- Cargar y personalizar una configuración para un modelo. -- Crear una arquitectura para un modelo. -- Crear tokenizadores rápidos y lentos para textos. -- Crear un extractor de propiedades para tareas de audio o imágenes. -- Crear un procesador para tareas multimodales. - -## Configuración - -Una [configuración](main_classes/configuration) es un conjunto de atributos específicos de un modelo. Cada configuración de modelo tiene atributos diferentes. Por ejemplo, todos los modelos de PLN tienen los atributos `hidden_size`, `num_attention_heads`, `num_hidden_layers` y `vocab_size` en común. Estos atributos especifican el número de cabezas de atención o de capas ocultas con las que se construyen los modelos. - -Puedes echarle un vistazo a [DistilBERT](model_doc/distilbert) y sus atributos accediendo a [`DistilBertConfig`]: - -```py ->>> from transformers import DistilBertConfig - ->>> config = DistilBertConfig() ->>> print(config) -DistilBertConfig { - "activation": "gelu", - "attention_dropout": 0.1, - "dim": 768, - "dropout": 0.1, - "hidden_dim": 3072, - "initializer_range": 0.02, - "max_position_embeddings": 512, - "model_type": "distilbert", - "n_heads": 12, - "n_layers": 6, - "pad_token_id": 0, - "qa_dropout": 0.1, - "seq_classif_dropout": 0.2, - "sinusoidal_pos_embds": false, - "transformers_version": "4.16.2", - "vocab_size": 30522 -} -``` - -[`DistilBertConfig`] muestra todos los atributos por defecto que se han usado para construir un modelo [`DistilBertModel`] base. Todos ellos son personalizables, lo que deja espacio para poder experimentar. Por ejemplo, puedes personalizar un modelo predeterminado para: - -- Probar una función de activación diferente, usando el parámetro `activation`. -- Usar un valor de abandono (también conocido como _dropout_) más alto para las probabilidades de las capas de atención, usando el parámetro `attention_dropout`. - -```py ->>> my_config = DistilBertConfig(activation="relu", attention_dropout=0.4) ->>> print(my_config) -DistilBertConfig { - "activation": "relu", - "attention_dropout": 0.4, - "dim": 768, - "dropout": 0.1, - "hidden_dim": 3072, - "initializer_range": 0.02, - "max_position_embeddings": 512, - "model_type": "distilbert", - "n_heads": 12, - "n_layers": 6, - "pad_token_id": 0, - "qa_dropout": 0.1, - "seq_classif_dropout": 0.2, - "sinusoidal_pos_embds": false, - "transformers_version": "4.16.2", - "vocab_size": 30522 -} -``` - -Los atributos de los modelos preentrenados pueden ser modificados con la función [`~PretrainedConfig.from_pretrained`]: - -```py ->>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4) -``` - -Cuando estés satisfecho con la configuración de tu modelo, puedes guardarlo con la función [`~PretrainedConfig.save_pretrained`]. Tu configuración se guardará en un archivo JSON dentro del directorio que le especifiques como parámetro. - -```py ->>> my_config.save_pretrained(save_directory="./your_model_save_path") -``` - -Para volver a usar el archivo de configuración, puedes cargarlo usando [`~PretrainedConfig.from_pretrained`]: - -```py ->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") -``` - - - -También puedes guardar los archivos de configuración como un diccionario; o incluso guardar solo la diferencia entre tu archivo personalizado y la configuración por defecto. 
Consulta la [documentación sobre configuración](main_classes/configuration) para ver más detalles. - - - -## Modelo - -El siguiente paso será crear un [modelo](main_classes/models). El modelo, al que a veces también nos referimos como arquitectura, es el encargado de definir cada capa y qué operaciones se realizan. Los atributos como `num_hidden_layers` de la configuración se usan para definir la arquitectura. Todos los modelos comparten una clase base, [`PreTrainedModel`], y algunos métodos comunes que se pueden usar para redimensionar los _embeddings_ o para recortar cabezas de auto-atención (también llamadas _self-attention heads_). Además, todos los modelos son subclases de [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) o [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/flax.linen.html#module), lo que significa que son compatibles con su respectivo framework. - - - - -Carga los atributos de tu configuración personalizada en el modelo de la siguiente forma: - -```py ->>> from transformers import DistilBertModel - ->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") ->>> model = DistilBertModel(my_config) -``` - -Esto crea un modelo con valores aleatorios, en lugar de crearlo con los pesos del preentrenamiento, por lo que no serás capaz de usar este modelo para nada útil hasta que no lo entrenes. El entrenamiento es un proceso costoso, tanto en cuestión de recursos como de tiempo, por lo que generalmente es mejor usar un modelo preentrenado para obtener mejores resultados más rápido, consumiendo una fracción de los recursos que un entrenamiento completo hubiera requerido. - -Puedes crear un modelo preentrenado con [`~PreTrainedModel.from_pretrained`]: - -```py ->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased") -``` - -Cuando cargues tus pesos del preentrenamiento, el modelo por defecto se carga automáticamente si nos lo proporciona 🤗 Transformers. Sin embargo, siempre puedes reemplazar (todos o algunos de) los atributos del modelo por defecto por los tuyos: - -```py ->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) -``` - - - -Carga los atributos de tu configuración personalizada en el modelo de la siguiente forma: - -```py ->>> from transformers import TFDistilBertModel - ->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") ->>> tf_model = TFDistilBertModel(my_config) -``` - -Esto crea un modelo con valores aleatorios, en lugar de crearlo con los pesos del preentrenamiento, por lo que no serás capaz de usar este modelo para nada útil hasta que no lo entrenes. El entrenamiento es un proceso costoso, tanto en cuestión de recursos como de tiempo, por lo que generalmente es mejor usar un modelo preentrenado para obtener mejores resultados más rápido, consumiendo solo una fracción de los recursos que un entrenamiento completo hubiera requerido. - -Puedes crear un modelo preentrenado con [`~TFPreTrainedModel.from_pretrained`]: - -```py ->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased") -``` - -Cuando cargues tus pesos del preentrenamiento, el modelo por defecto se carga automáticamente si este nos lo proporciona 🤗 Transformers. 
Sin embargo, siempre puedes reemplazar (todos o algunos de) los atributos del modelo por defecto por los tuyos: - -```py ->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) -``` - - - -### Cabezas de modelo - -En este punto del tutorial, tenemos un modelo DistilBERT base que devuelve los *hidden states* o estados ocultos. Los *hidden states* se pasan como parámetros de entrada a la cabeza del modelo para producir la salida. 🤗 Transformers ofrece una cabeza de modelo diferente para cada tarea, siempre y cuando el modelo sea compatible para la tarea (por ejemplo, no puedes usar DistilBERT para una tarea secuencia a secuencia como la traducción). - - - - - -Por ejemplo, [`DistilBertForSequenceClassification`] es un modelo DistilBERT base con una cabeza de clasificación de secuencias. La cabeza de clasificación de secuencias es una capa superior que precede a la recolección de las salidas. - -```py ->>> from transformers import DistilBertForSequenceClassification - ->>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Puedes reutilizar este punto de guardado o *checkpoint* para otra tarea fácilmente cambiando a una cabeza de un modelo diferente. Para una tarea de respuesta a preguntas, puedes usar la cabeza del modelo [`DistilBertForQuestionAnswering`]. La cabeza de respuesta a preguntas es similar a la de clasificación de secuencias, excepto porque consta de una capa lineal delante de la salida de los *hidden states*. - - -```py ->>> from transformers import DistilBertForQuestionAnswering - ->>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") -``` - - - -Por ejemplo, [`TFDistilBertForSequenceClassification`] es un modelo DistilBERT base con una cabeza de clasificación de secuencias. La cabeza de clasificación de secuencias es una capa superior que precede a la recolección de las salidas. - -```py ->>> from transformers import TFDistilBertForSequenceClassification - ->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Puedes reutilizar este punto de guardado o *checkpoint* para otra tarea fácilmente cambiando a una cabeza de un modelo diferente. Para una tarea de respuesta a preguntas, puedes usar la cabeza del modelo [`TFDistilBertForQuestionAnswering`]. La cabeza de respuesta a preguntas es similar a la de clasificación de secuencias, excepto porque consta de una capa lineal delante de la salida de los *hidden states*. - - -```py ->>> from transformers import TFDistilBertForQuestionAnswering - ->>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") -``` - - - -## Tokenizer - -La ultima clase base que debes conocer antes de usar un modelo con datos textuales es la clase [tokenizer](main_classes/tokenizer), que convierte el texto bruto en tensores. Hay dos tipos de *tokenizers* que puedes usar con 🤗 Transformers: - -- [`PreTrainedTokenizer`]: una implementación de un *tokenizer* hecha en Python. -- [`PreTrainedTokenizerFast`]: un *tokenizer* de nuestra librería [🤗 Tokenizer](https://huggingface.co/docs/tokenizers/python/latest/), basada en Rust. Este tipo de *tokenizer* es bastante más rápido, especialmente durante la tokenización por lotes, gracias a estar implementado en Rust. Esta rápida tokenización también ofrece métodos adicionales como el *offset mapping*, que relaciona los tokens con sus palabras o caracteres originales. 
- -Ambos *tokenizers* son compatibles con los métodos comunes, como los de encodificación y decodificación, los métodos para añadir tokens y aquellos que manejan tokens especiales. - - - -No todos los modelos son compatibles con un *tokenizer* rápido. Échale un vistazo a esta [tabla](index#supported-frameworks) para comprobar si un modelo específico es compatible con un *tokenizer* rápido. - - - -Si has entrenado tu propio *tokenizer*, puedes crear uno desde tu archivo de “vocabulario”: - -```py ->>> from transformers import DistilBertTokenizer - ->>> my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left") -``` - -Es importante recordar que los vocabularios que provienen de un *tokenizer* personalizado serán diferentes a los vocabularios generados por el *tokenizer* de un modelo preentrenado. Debes usar el vocabulario de un *tokenizer* preentrenado si vas a usar un modelo preentrenado, de lo contrario las entradas no tendrán sentido. Crea un *tokenizer* con el vocabulario de un modelo preentrenado usando la clase [`DistilBertTokenizer`]: - - -```py ->>> from transformers import DistilBertTokenizer - ->>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") -``` - -Crea un *tokenizer* rápido con la clase [`DistilBertTokenizerFast`]: - - -```py ->>> from transformers import DistilBertTokenizerFast - ->>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased") -``` - - - -Por defecto, el [`AutoTokenizer`] intentará cargar un *tokenizer* rápido. Puedes desactivar este comportamiento cambiando el parámetro `use_fast=False` de `from_pretrained`. - - - - -## Extractor de Características - -Un extractor de características procesa entradas de audio e imagen. Hereda de la clase base [`~feature_extraction_utils.FeatureExtractionMixin`] y también puede heredar de la clase [`ImageFeatureExtractionMixin`] para el procesamiento de características de las imágenes o de la clase [`SequenceFeatureExtractor`] para el procesamiento de entradas de audio. - -Dependiendo de si trabajas en una tarea de audio o de video, puedes crear un extractor de características asociado al modelo que estés usando. Por ejemplo, podrías crear un [`ViTFeatureExtractor`] por defecto si estás usando [ViT](model_doc/vit) para clasificación de imágenes: - -```py ->>> from transformers import ViTFeatureExtractor - ->>> vit_extractor = ViTFeatureExtractor() ->>> print(vit_extractor) -ViTFeatureExtractor { - "do_normalize": true, - "do_resize": true, - "feature_extractor_type": "ViTFeatureExtractor", - "image_mean": [ - 0.5, - 0.5, - 0.5 - ], - "image_std": [ - 0.5, - 0.5, - 0.5 - ], - "resample": 2, - "size": 224 -} -``` - - - -Si no estás buscando ninguna personalización en específico, usa el método `from_pretrained` para cargar los parámetros del extractor de características por defecto del modelo. 
- - - -Puedes modificar cualquier parámetro de [`ViTFeatureExtractor`] para crear tu extractor de características personalizado: - -```py ->>> from transformers import ViTFeatureExtractor - ->>> my_vit_extractor = ViTFeatureExtractor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3]) ->>> print(my_vit_extractor) -ViTFeatureExtractor { - "do_normalize": false, - "do_resize": true, - "feature_extractor_type": "ViTFeatureExtractor", - "image_mean": [ - 0.3, - 0.3, - 0.3 - ], - "image_std": [ - 0.5, - 0.5, - 0.5 - ], - "resample": "PIL.Image.BOX", - "size": 224 -} -``` - -Para las entradas de audio, puedes crear un [`Wav2Vec2FeatureExtractor`] y personalizar los parámetros de una forma similar: - - -```py ->>> from transformers import Wav2Vec2FeatureExtractor - ->>> w2v2_extractor = Wav2Vec2FeatureExtractor() ->>> print(w2v2_extractor) -Wav2Vec2FeatureExtractor { - "do_normalize": true, - "feature_extractor_type": "Wav2Vec2FeatureExtractor", - "feature_size": 1, - "padding_side": "right", - "padding_value": 0.0, - "return_attention_mask": false, - "sampling_rate": 16000 -} -``` - -## Procesador - -Para modelos que son compatibles con tareas multimodales, 🤗 Transformers ofrece una clase *procesador* que agrupa un extractor de características y un *tokenizer* en el mismo objeto. Por ejemplo, probemos a usar el procesador [`Wav2Vec2Processor`] para una tarea de reconocimiento de voz (ASR). Un ASR transcribe el audio a texto, por lo que necesitaremos un extractor de características y un *tokenizer*. - -Crea un extractor de características para manejar la entrada de audio: - - -```py ->>> from transformers import Wav2Vec2FeatureExtractor - ->>> feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True) -``` - -Crea un *tokenizer* para manejar la entrada de texto: - -```py ->>> from transformers import Wav2Vec2CTCTokenizer - ->>> tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt") -``` - -Puedes combinar el extractor de características y el *tokenizer* en el [`Wav2Vec2Processor`]: - - -```py ->>> from transformers import Wav2Vec2Processor - ->>> processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) -``` -Con dos clases base (la configuración y el modelo) y una clase de preprocesamiento adicional (*tokenizer*, extractor de características o procesador), puedes crear cualquiera de los modelos compatibles con 🤗 Transformers. Cada una de estas clases son configurables, permitiéndote usar sus atributos específicos. Puedes crear un modelo para entrenarlo de una forma fácil, o modificar un modelo preentrenado disponible para especializarlo. diff --git a/docs/source/es/custom_models.md b/docs/source/es/custom_models.md new file mode 100644 index 000000000000..e616a056055e --- /dev/null +++ b/docs/source/es/custom_models.md @@ -0,0 +1,358 @@ + + +# Compartir modelos personalizados + +La biblioteca 🤗 Transformers está diseñada para ser fácilmente ampliable. Cada modelo está completamente codificado +sin abstracción en una subcarpeta determinada del repositorio, por lo que puedes copiar fácilmente un archivo del modelo +y ajustarlo según tus necesidades. + +Si estás escribiendo un modelo completamente nuevo, podría ser más fácil comenzar desde cero. 
En este tutorial, te mostraremos +cómo escribir un modelo personalizado y su configuración para que pueda usarse dentro de Transformers, y cómo puedes compartirlo +con la comunidad (con el código en el que se basa) para que cualquiera pueda usarlo, incluso si no está presente en la biblioteca +🤗 Transformers. + +Ilustraremos todo esto con un modelo ResNet, envolviendo la clase ResNet de la [biblioteca timm](https://github.com/rwightman/pytorch-image-models) en un [`PreTrainedModel`]. + +## Escribir una configuración personalizada + +Antes de adentrarnos en el modelo, primero escribamos su configuración. La configuración de un modelo es un objeto que +contendrá toda la información necesaria para construir el modelo. Como veremos en la siguiente sección, el modelo solo puede +tomar un `config` para ser inicializado, por lo que realmente necesitamos que ese objeto esté lo más completo posible. + +En nuestro ejemplo, tomaremos un par de argumentos de la clase ResNet que tal vez queramos modificar. Las diferentes +configuraciones nos darán los diferentes tipos de ResNet que son posibles. Luego simplemente almacenamos esos argumentos +después de verificar la validez de algunos de ellos. + +```python +from transformers import PretrainedConfig +from typing import List + + +class ResnetConfig(PretrainedConfig): + model_type = "resnet" + + def __init__( + self, + block_type="bottleneck", + layers: List[int] = [3, 4, 6, 3], + num_classes: int = 1000, + input_channels: int = 3, + cardinality: int = 1, + base_width: int = 64, + stem_width: int = 64, + stem_type: str = "", + avg_down: bool = False, + **kwargs, + ): + if block_type not in ["basic", "bottleneck"]: + raise ValueError(f"`block_type` must be 'basic' or bottleneck', got {block_type}.") + if stem_type not in ["", "deep", "deep-tiered"]: + raise ValueError(f"`stem_type` must be '', 'deep' or 'deep-tiered', got {stem_type}.") + + self.block_type = block_type + self.layers = layers + self.num_classes = num_classes + self.input_channels = input_channels + self.cardinality = cardinality + self.base_width = base_width + self.stem_width = stem_width + self.stem_type = stem_type + self.avg_down = avg_down + super().__init__(**kwargs) +``` + +Las tres cosas importantes que debes recordar al escribir tu propia configuración son las siguientes: +- tienes que heredar de `PretrainedConfig`, +- el `__init__` de tu `PretrainedConfig` debe aceptar cualquier `kwargs`, +- esos `kwargs` deben pasarse a la superclase `__init__`. + +La herencia es para asegurarte de obtener toda la funcionalidad de la biblioteca 🤗 Transformers, mientras que las otras dos +restricciones provienen del hecho de que una `PretrainedConfig` tiene más campos que los que estás configurando. Al recargar una +`config` con el método `from_pretrained`, esos campos deben ser aceptados por tu `config` y luego enviados a la superclase. + +Definir un `model_type` para tu configuración (en este caso `model_type="resnet"`) no es obligatorio, a menos que quieras +registrar tu modelo con las clases automáticas (ver la última sección). + +Una vez hecho esto, puedes crear y guardar fácilmente tu configuración como lo harías con cualquier otra configuración de un +modelo de la biblioteca. 
Así es como podemos crear una configuración resnet50d y guardarla:
+
+```py
+resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True)
+resnet50d_config.save_pretrained("custom-resnet")
+```
+
+Esto guardará un archivo llamado `config.json` dentro de la carpeta `custom-resnet`. Luego puedes volver a cargar tu configuración con el método `from_pretrained`:
+
+```py
+resnet50d_config = ResnetConfig.from_pretrained("custom-resnet")
+```
+
+También puedes usar cualquier otro método de la clase [`PretrainedConfig`], como [`~PretrainedConfig.push_to_hub`], para subir directamente tu configuración al Hub.
+
+## Escribir un modelo personalizado
+
+Ahora que tenemos nuestra configuración de ResNet, podemos seguir escribiendo el modelo. En realidad escribiremos dos: uno que extrae las características ocultas de un grupo de imágenes (como [`BertModel`]) y otro que es adecuado para clasificación de imágenes (como [`BertForSequenceClassification`]).
+
+Como mencionamos antes, solo escribiremos una envoltura (_wrapper_) ligera del modelo para simplificar este ejemplo. Lo único que debemos hacer antes de escribir esta clase es definir un mapeo entre los tipos de bloques y las clases de bloques reales. Luego se define el modelo desde la configuración pasando todo a la clase `ResNet`:
+
+```py
+from transformers import PreTrainedModel
+from timm.models.resnet import BasicBlock, Bottleneck, ResNet
+from .configuration_resnet import ResnetConfig
+
+
+BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck}
+
+
+class ResnetModel(PreTrainedModel):
+    config_class = ResnetConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        block_layer = BLOCK_MAPPING[config.block_type]
+        self.model = ResNet(
+            block_layer,
+            config.layers,
+            num_classes=config.num_classes,
+            in_chans=config.input_channels,
+            cardinality=config.cardinality,
+            base_width=config.base_width,
+            stem_width=config.stem_width,
+            stem_type=config.stem_type,
+            avg_down=config.avg_down,
+        )
+
+    def forward(self, tensor):
+        return self.model.forward_features(tensor)
+```
+
+Para el modelo que clasificará las imágenes, solo cambiamos el método de avance (es decir, el método `forward`):
+
+```py
+import torch
+
+
+class ResnetModelForImageClassification(PreTrainedModel):
+    config_class = ResnetConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        block_layer = BLOCK_MAPPING[config.block_type]
+        self.model = ResNet(
+            block_layer,
+            config.layers,
+            num_classes=config.num_classes,
+            in_chans=config.input_channels,
+            cardinality=config.cardinality,
+            base_width=config.base_width,
+            stem_width=config.stem_width,
+            stem_type=config.stem_type,
+            avg_down=config.avg_down,
+        )
+
+    def forward(self, tensor, labels=None):
+        logits = self.model(tensor)
+        if labels is not None:
+            # pérdida estándar de clasificación (entropía cruzada)
+            loss = torch.nn.functional.cross_entropy(logits, labels)
+            return {"loss": loss, "logits": logits}
+        return {"logits": logits}
+```
+
+En ambos casos, observa cómo heredamos de `PreTrainedModel` y llamamos a la inicialización de la superclase con `config` (un poco como cuando escribes `torch.nn.Module`). La línea que establece `config_class` no es obligatoria, a menos que quieras registrar tu modelo con las clases automáticas (consulta la última sección).
+
+
+
+Si tu modelo es muy similar a un modelo dentro de la biblioteca, puedes reutilizar la misma configuración de ese modelo.
+ + + +Puedes hacer que tu modelo devuelva lo que quieras, pero devolver un diccionario como lo hicimos para +`ResnetModelForImageClassification`, con el `loss` incluido cuando se pasan las etiquetas, hará que tu modelo se pueda +usar directamente dentro de la clase [`Trainer`]. Usar otro formato de salida está bien, siempre y cuando estés planeando usar +tu propio bucle de entrenamiento u otra biblioteca para el entrenamiento. + +Ahora que tenemos nuestra clase, vamos a crear un modelo: + +```py +resnet50d = ResnetModelForImageClassification(resnet50d_config) +``` + +Nuevamente, puedes usar cualquiera de los métodos de [`PreTrainedModel`], como [`~PreTrainedModel.save_pretrained`] o +[`~PreTrainedModel.push_to_hub`]. Usaremos el segundo en la siguiente sección y veremos cómo pasar los pesos del modelo +con el código de nuestro modelo. Pero primero, carguemos algunos pesos previamente entrenados dentro de nuestro modelo. + +En tu caso de uso, probablemente estarás entrenando tu modelo personalizado con tus propios datos. Para ir rápido en este +tutorial, usaremos la versión preentrenada de resnet50d. Dado que nuestro modelo es solo un envoltorio alrededor del resnet50d +original, será fácil transferir esos pesos: + +```py +import timm + +pretrained_model = timm.create_model("resnet50d", pretrained=True) +resnet50d.model.load_state_dict(pretrained_model.state_dict()) +``` + +Ahora veamos cómo asegurarnos de que cuando hacemos [`~PreTrainedModel.save_pretrained`] o [`~PreTrainedModel.push_to_hub`], +se guarda el código del modelo. + +## Enviar el código al _Hub_ + + + +Esta _API_ es experimental y puede tener algunos cambios leves en las próximas versiones. + + + +Primero, asegúrate de que tu modelo esté completamente definido en un archivo `.py`. Puedes basarte en importaciones +relativas a otros archivos, siempre que todos los archivos estén en el mismo directorio (aún no admitimos submódulos +para esta característica). Para nuestro ejemplo, definiremos un archivo `modeling_resnet.py` y un archivo +`configuration_resnet.py` en una carpeta del directorio de trabajo actual llamado `resnet_model`. El archivo de configuración +contiene el código de `ResnetConfig` y el archivo del modelo contiene el código de `ResnetModel` y +`ResnetModelForImageClassification`. + +``` +. +└── resnet_model + ├── __init__.py + ├── configuration_resnet.py + └── modeling_resnet.py +``` + +El `__init__.py` puede estar vacío, solo está ahí para que Python detecte que `resnet_model` se puede usar como un módulo. + + + +Si copias archivos del modelo desde la biblioteca, deberás reemplazar todas las importaciones relativas en la parte superior +del archivo para importarlos desde el paquete `transformers`. + + + +Ten en cuenta que puedes reutilizar (o subclasificar) una configuración o modelo existente. 
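Como bosquejo puramente ilustrativo de esa idea (los nombres `MyBertConfig` y `extra_dropout` son hipotéticos), subclasificar una configuración existente de la biblioteca sigue las mismas tres reglas descritas arriba:

```py
from transformers import BertConfig


class MyBertConfig(BertConfig):
    # un model_type propio evita chocar con los tipos ya registrados en la biblioteca
    model_type = "my-bert"

    def __init__(self, extra_dropout: float = 0.1, **kwargs):
        self.extra_dropout = extra_dropout
        super().__init__(**kwargs)


my_config = MyBertConfig(extra_dropout=0.2, num_hidden_layers=6)
my_config.save_pretrained("my-bert-config")  # se puede recargar con MyBertConfig.from_pretrained
```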
+ +Para compartir tu modelo con la comunidad, sigue estos pasos: primero importa el modelo y la configuración de ResNet desde +los archivos recién creados: + +```py +from resnet_model.configuration_resnet import ResnetConfig +from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification +``` + +Luego, debes decirle a la biblioteca que deseas copiar el código de esos objetos cuando usas el método `save_pretrained` +y registrarlos correctamente con una determinada clase automática (especialmente para modelos), simplemente ejecuta: + +```py +ResnetConfig.register_for_auto_class() +ResnetModel.register_for_auto_class("AutoModel") +ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification") +``` + +Ten en cuenta que no es necesario especificar una clase automática para la configuración (solo hay una clase automática +para ellos, [`AutoConfig`]), pero es diferente para los modelos. Tu modelo personalizado podría ser adecuado para muchas +tareas diferentes, por lo que debes especificar cuál de las clases automáticas es la correcta para tu modelo. + +A continuación, vamos a crear la configuración y los modelos como lo hicimos antes: + +```py +resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) +resnet50d = ResnetModelForImageClassification(resnet50d_config) + +pretrained_model = timm.create_model("resnet50d", pretrained=True) +resnet50d.model.load_state_dict(pretrained_model.state_dict()) +``` + +Ahora, para enviar el modelo al Hub, asegúrate de haber iniciado sesión. Ejecuta en tu terminal: + +```bash +huggingface-cli login +``` + +o desde un _notebook_: + +```py +from huggingface_hub import notebook_login + +notebook_login() +``` + +Luego puedes ingresar a tu propio espacio (o una organización de la que seas miembro) de esta manera: + +```py +resnet50d.push_to_hub("custom-resnet50d") +``` + +Además de los pesos del modelo y la configuración en formato json, esto también copió los archivos `.py` del modelo y la +configuración en la carpeta `custom-resnet50d` y subió el resultado al Hub. Puedes verificar el resultado en este +[repositorio de modelos](https://huggingface.co/sgugger/custom-resnet50d). + +Consulta el tutorial sobre cómo [compartir modelos](model_sharing) para obtener más información sobre el método para subir modelos al Hub. + +## Usar un modelo con código personalizado + +Puedes usar cualquier configuración, modelo o _tokenizador_ con archivos de código personalizado en tu repositorio con las +clases automáticas y el método `from_pretrained`. Todos los archivos y códigos cargados en el Hub se analizan en busca de +malware (consulta la documentación de [seguridad del Hub](https://huggingface.co/docs/hub/security#malware-scanning) para +obtener más información), pero aún debes revisar el código del modelo y el autor para evitar la ejecución de código malicioso +en tu computadora. Configura `trust_remote_code=True` para usar un modelo con código personalizado: + +```py +from transformers import AutoModelForImageClassification + +model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True) +``` + +También se recomienda encarecidamente pasar un _hash_ de confirmación como una "revisión" para asegurarte de que el autor +de los modelos no actualizó el código con algunas líneas nuevas maliciosas (a menos que confíes plenamente en los autores +de los modelos). 
+ +```py +commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292" +model = AutoModelForImageClassification.from_pretrained( + "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash +) +``` + +Ten en cuenta que al navegar por el historial de confirmaciones del repositorio del modelo en Hub, hay un botón para copiar +fácilmente el hash de confirmación de cualquier _commit_. + +## Registrar un model con código personalizado a las clases automáticas + +Si estás escribiendo una biblioteca que amplía 🤗 Transformers, es posible que quieras ampliar las clases automáticas para +incluir tu propio modelo. Esto es diferente de enviar el código al Hub en el sentido de que los usuarios necesitarán importar +tu biblioteca para obtener los modelos personalizados (al contrario de descargar automáticamente el código del modelo desde Hub). + +Siempre que tu configuración tenga un atributo `model_type` que sea diferente de los tipos de modelos existentes, y que tus +clases modelo tengan los atributos `config_class` correctos, puedes agregarlos a las clases automáticas de la siguiente manera: + +```py +from transformers import AutoConfig, AutoModel, AutoModelForImageClassification + +AutoConfig.register("resnet", ResnetConfig) +AutoModel.register(ResnetConfig, ResnetModel) +AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification) +``` + +Ten en cuenta que el primer argumento utilizado al registrar tu configuración personalizada en [`AutoConfig`] debe coincidir +con el `model_type` de tu configuración personalizada, y el primer argumento utilizado al registrar tus modelos personalizados +en cualquier clase del modelo automático debe coincidir con el `config_class ` de esos modelos. diff --git a/docs/source/es/custom_models.mdx b/docs/source/es/custom_models.mdx deleted file mode 100644 index 434d59f87dae..000000000000 --- a/docs/source/es/custom_models.mdx +++ /dev/null @@ -1,354 +0,0 @@ - - -# Compartir modelos personalizados - -La biblioteca 🤗 Transformers está diseñada para ser fácilmente ampliable. Cada modelo está completamente codificado -sin abstracción en una subcarpeta determinada del repositorio, por lo que puedes copiar fácilmente un archivo del modelo -y ajustarlo según tus necesidades. - -Si estás escribiendo un modelo completamente nuevo, podría ser más fácil comenzar desde cero. En este tutorial, te mostraremos -cómo escribir un modelo personalizado y su configuración para que pueda usarse dentro de Transformers, y cómo puedes compartirlo -con la comunidad (con el código en el que se basa) para que cualquiera pueda usarlo, incluso si no está presente en la biblioteca -🤗 Transformers. - -Ilustraremos todo esto con un modelo ResNet, envolviendo la clase ResNet de la [biblioteca timm](https://github.com/rwightman/pytorch-image-models) en un [`PreTrainedModel`]. - -## Escribir una configuración personalizada - -Antes de adentrarnos en el modelo, primero escribamos su configuración. La configuración de un modelo es un objeto que -contendrá toda la información necesaria para construir el modelo. Como veremos en la siguiente sección, el modelo solo puede -tomar un `config` para ser inicializado, por lo que realmente necesitamos que ese objeto esté lo más completo posible. - -En nuestro ejemplo, tomaremos un par de argumentos de la clase ResNet que tal vez queramos modificar. Las diferentes -configuraciones nos darán los diferentes tipos de ResNet que son posibles. 
Luego simplemente almacenamos esos argumentos -después de verificar la validez de algunos de ellos. - -```python -from transformers import PretrainedConfig -from typing import List - - -class ResnetConfig(PretrainedConfig): - model_type = "resnet" - - def __init__( - self, - block_type="bottleneck", - layers: List[int] = [3, 4, 6, 3], - num_classes: int = 1000, - input_channels: int = 3, - cardinality: int = 1, - base_width: int = 64, - stem_width: int = 64, - stem_type: str = "", - avg_down: bool = False, - **kwargs, - ): - if block_type not in ["basic", "bottleneck"]: - raise ValueError(f"`block_type` must be 'basic' or bottleneck', got {block_type}.") - if stem_type not in ["", "deep", "deep-tiered"]: - raise ValueError(f"`stem_type` must be '', 'deep' or 'deep-tiered', got {stem_type}.") - - self.block_type = block_type - self.layers = layers - self.num_classes = num_classes - self.input_channels = input_channels - self.cardinality = cardinality - self.base_width = base_width - self.stem_width = stem_width - self.stem_type = stem_type - self.avg_down = avg_down - super().__init__(**kwargs) -``` - -Las tres cosas importantes que debes recordar al escribir tu propia configuración son las siguientes: -- tienes que heredar de `PretrainedConfig`, -- el `__init__` de tu `PretrainedConfig` debe aceptar cualquier `kwargs`, -- esos `kwargs` deben pasarse a la superclase `__init__`. - -La herencia es para asegurarte de obtener toda la funcionalidad de la biblioteca 🤗 Transformers, mientras que las otras dos -restricciones provienen del hecho de que una `PretrainedConfig` tiene más campos que los que estás configurando. Al recargar una -`config` con el método `from_pretrained`, esos campos deben ser aceptados por tu `config` y luego enviados a la superclase. - -Definir un `model_type` para tu configuración (en este caso `model_type="resnet"`) no es obligatorio, a menos que quieras -registrar tu modelo con las clases automáticas (ver la última sección). - -Una vez hecho esto, puedes crear y guardar fácilmente tu configuración como lo harías con cualquier otra configuración de un -modelo de la biblioteca. Así es como podemos crear una configuración resnet50d y guardarla: - -```py -resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) -resnet50d_config.save_pretrained("custom-resnet") -``` - -Esto guardará un archivo llamado `config.json` dentro de la carpeta `custom-resnet`. Luego puedes volver a cargar tu configuración -con el método `from_pretrained`: - -```py -resnet50d_config = ResnetConfig.from_pretrained("custom-resnet") -``` - -También puedes usar cualquier otro método de la clase [`PretrainedConfig`], como [`~PretrainedConfig.push_to_hub`], para cargar -directamente tu configuración en el Hub. - -## Escribir un modelo personalizado - -Ahora que tenemos nuestra configuración de ResNet, podemos seguir escribiendo el modelo. En realidad escribiremos dos: una que -extrae las características ocultas de un grupo de imágenes (como [`BertModel`]) y una que es adecuada para clasificación de -imagenes (como [`BertForSequenceClassification`]). - -Como mencionamos antes, solo escribiremos un envoltura (_wrapper_) libre del modelo para simplificar este ejemplo. Lo único que debemos -hacer antes de escribir esta clase es un mapeo entre los tipos de bloques y las clases de bloques reales. 
Luego se define el -modelo desde la configuración pasando todo a la clase `ResNet`: - -```py -from transformers import PreTrainedModel -from timm.models.resnet import BasicBlock, Bottleneck, ResNet -from .configuration_resnet import ResnetConfig - - -BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck} - - -class ResnetModel(PreTrainedModel): - config_class = ResnetConfig - - def __init__(self, config): - super().__init__(config) - block_layer = BLOCK_MAPPING[config.block_type] - self.model = ResNet( - block_layer, - config.layers, - num_classes=config.num_classes, - in_chans=config.input_channels, - cardinality=config.cardinality, - base_width=config.base_width, - stem_width=config.stem_width, - stem_type=config.stem_type, - avg_down=config.avg_down, - ) - - def forward(self, tensor): - return self.model.forward_features(tensor) -``` - -Para el modelo que clasificará las imágenes, solo cambiamos el método de avance (es decir, el método `forward`): - -```py -import torch - - -class ResnetModelForImageClassification(PreTrainedModel): - config_class = ResnetConfig - - def __init__(self, config): - super().__init__(config) - block_layer = BLOCK_MAPPING[config.block_type] - self.model = ResNet( - block_layer, - config.layers, - num_classes=config.num_classes, - in_chans=config.input_channels, - cardinality=config.cardinality, - base_width=config.base_width, - stem_width=config.stem_width, - stem_type=config.stem_type, - avg_down=config.avg_down, - ) - - def forward(self, tensor, labels=None): - logits = self.model(tensor) - if labels is not None: - loss = torch.nn.cross_entropy(logits, labels) - return {"loss": loss, "logits": logits} - return {"logits": logits} -``` - -En ambos casos, observa cómo heredamos de `PreTrainedModel` y llamamos a la inicialización de la superclase con `config` -(un poco como cuando escribes `torch.nn.Module`). La línea que establece `config_class` no es obligatoria, a menos -que quieras registrar tu modelo con las clases automáticas (consulta la última sección). - - - -Si tu modelo es muy similar a un modelo dentro de la biblioteca, puedes reutilizar la misma configuración de ese modelo. - - - -Puedes hacer que tu modelo devuelva lo que quieras, pero devolver un diccionario como lo hicimos para -`ResnetModelForImageClassification`, con el `loss` incluido cuando se pasan las etiquetas, hará que tu modelo se pueda -usar directamente dentro de la clase [`Trainer`]. Usar otro formato de salida está bien, siempre y cuando estés planeando usar -tu propio bucle de entrenamiento u otra biblioteca para el entrenamiento. - -Ahora que tenemos nuestra clase, vamos a crear un modelo: - -```py -resnet50d = ResnetModelForImageClassification(resnet50d_config) -``` - -Nuevamente, puedes usar cualquiera de los métodos de [`PreTrainedModel`], como [`~PreTrainedModel.save_pretrained`] o -[`~PreTrainedModel.push_to_hub`]. Usaremos el segundo en la siguiente sección y veremos cómo pasar los pesos del modelo -con el código de nuestro modelo. Pero primero, carguemos algunos pesos previamente entrenados dentro de nuestro modelo. - -En tu caso de uso, probablemente estarás entrenando tu modelo personalizado con tus propios datos. Para ir rápido en este -tutorial, usaremos la versión preentrenada de resnet50d. 
Dado que nuestro modelo es solo un envoltorio alrededor del resnet50d -original, será fácil transferir esos pesos: - -```py -import timm - -pretrained_model = timm.create_model("resnet50d", pretrained=True) -resnet50d.model.load_state_dict(pretrained_model.state_dict()) -``` - -Ahora veamos cómo asegurarnos de que cuando hacemos [`~PreTrainedModel.save_pretrained`] o [`~PreTrainedModel.push_to_hub`], -se guarda el código del modelo. - -## Enviar el código al _Hub_ - - - -Esta _API_ es experimental y puede tener algunos cambios leves en las próximas versiones. - - - -Primero, asegúrate de que tu modelo esté completamente definido en un archivo `.py`. Puedes basarte en importaciones -relativas a otros archivos, siempre que todos los archivos estén en el mismo directorio (aún no admitimos submódulos -para esta característica). Para nuestro ejemplo, definiremos un archivo `modeling_resnet.py` y un archivo -`configuration_resnet.py` en una carpeta del directorio de trabajo actual llamado `resnet_model`. El archivo de configuración -contiene el código de `ResnetConfig` y el archivo del modelo contiene el código de `ResnetModel` y -`ResnetModelForImageClassification`. - -``` -. -└── resnet_model - ├── __init__.py - ├── configuration_resnet.py - └── modeling_resnet.py -``` - -El `__init__.py` puede estar vacío, solo está ahí para que Python detecte que `resnet_model` se puede usar como un módulo. - - - -Si copias archivos del modelo desde la biblioteca, deberás reemplazar todas las importaciones relativas en la parte superior -del archivo para importarlos desde el paquete `transformers`. - - - -Ten en cuenta que puedes reutilizar (o subclasificar) una configuración o modelo existente. - -Para compartir tu modelo con la comunidad, sigue estos pasos: primero importa el modelo y la configuración de ResNet desde -los archivos recién creados: - -```py -from resnet_model.configuration_resnet import ResnetConfig -from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification -``` - -Luego, debes decirle a la biblioteca que deseas copiar el código de esos objetos cuando usas el método `save_pretrained` -y registrarlos correctamente con una determinada clase automática (especialmente para modelos), simplemente ejecuta: - -```py -ResnetConfig.register_for_auto_class() -ResnetModel.register_for_auto_class("AutoModel") -ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification") -``` - -Ten en cuenta que no es necesario especificar una clase automática para la configuración (solo hay una clase automática -para ellos, [`AutoConfig`]), pero es diferente para los modelos. Tu modelo personalizado podría ser adecuado para muchas -tareas diferentes, por lo que debes especificar cuál de las clases automáticas es la correcta para tu modelo. - -A continuación, vamos a crear la configuración y los modelos como lo hicimos antes: - -```py -resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) -resnet50d = ResnetModelForImageClassification(resnet50d_config) - -pretrained_model = timm.create_model("resnet50d", pretrained=True) -resnet50d.model.load_state_dict(pretrained_model.state_dict()) -``` - -Ahora, para enviar el modelo al Hub, asegúrate de haber iniciado sesión. 
Ejecuta en tu terminal: - -```bash -huggingface-cli login -``` - -o desde un _notebook_: - -```py -from huggingface_hub import notebook_login - -notebook_login() -``` - -Luego puedes ingresar a tu propio espacio (o una organización de la que seas miembro) de esta manera: - -```py -resnet50d.push_to_hub("custom-resnet50d") -``` - -Además de los pesos del modelo y la configuración en formato json, esto también copió los archivos `.py` del modelo y la -configuración en la carpeta `custom-resnet50d` y subió el resultado al Hub. Puedes verificar el resultado en este -[repositorio de modelos](https://huggingface.co/sgugger/custom-resnet50d). - -Consulta el tutorial sobre cómo [compartir modelos](model_sharing) para obtener más información sobre el método para subir modelos al Hub. - -## Usar un modelo con código personalizado - -Puedes usar cualquier configuración, modelo o _tokenizador_ con archivos de código personalizado en tu repositorio con las -clases automáticas y el método `from_pretrained`. Todos los archivos y códigos cargados en el Hub se analizan en busca de -malware (consulta la documentación de [seguridad del Hub](https://huggingface.co/docs/hub/security#malware-scanning) para -obtener más información), pero aún debes revisar el código del modelo y el autor para evitar la ejecución de código malicioso -en tu computadora. Configura `trust_remote_code=True` para usar un modelo con código personalizado: - -```py -from transformers import AutoModelForImageClassification - -model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True) -``` - -También se recomienda encarecidamente pasar un _hash_ de confirmación como una "revisión" para asegurarte de que el autor -de los modelos no actualizó el código con algunas líneas nuevas maliciosas (a menos que confíes plenamente en los autores -de los modelos). - -```py -commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292" -model = AutoModelForImageClassification.from_pretrained( - "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash -) -``` - -Ten en cuenta que al navegar por el historial de confirmaciones del repositorio del modelo en Hub, hay un botón para copiar -fácilmente el hash de confirmación de cualquier _commit_. - -## Registrar un model con código personalizado a las clases automáticas - -Si estás escribiendo una biblioteca que amplía 🤗 Transformers, es posible que quieras ampliar las clases automáticas para -incluir tu propio modelo. Esto es diferente de enviar el código al Hub en el sentido de que los usuarios necesitarán importar -tu biblioteca para obtener los modelos personalizados (al contrario de descargar automáticamente el código del modelo desde Hub). 
- -Siempre que tu configuración tenga un atributo `model_type` que sea diferente de los tipos de modelos existentes, y que tus -clases modelo tengan los atributos `config_class` correctos, puedes agregarlos a las clases automáticas de la siguiente manera: - -```py -from transformers import AutoConfig, AutoModel, AutoModelForImageClassification - -AutoConfig.register("resnet", ResnetConfig) -AutoModel.register(ResnetConfig, ResnetModel) -AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification) -``` - -Ten en cuenta que el primer argumento utilizado al registrar tu configuración personalizada en [`AutoConfig`] debe coincidir -con el `model_type` de tu configuración personalizada, y el primer argumento utilizado al registrar tus modelos personalizados -en cualquier clase del modelo automático debe coincidir con el `config_class ` de esos modelos. diff --git a/docs/source/es/debugging.md b/docs/source/es/debugging.md new file mode 100644 index 000000000000..313566753052 --- /dev/null +++ b/docs/source/es/debugging.md @@ -0,0 +1,335 @@ + + +# Debugging + +## Debug de problemas de Network multi-GPU + +Cuando entrenas o infieres con `DistributedDataParallel` y varias GPUs, si encuentras problemas de intercomunicación entre procesos y/o nodos, puedes usar el siguiente script para diagnosticar problemas de red. + +```bash +wget https://raw.githubusercontent.com/huggingface/transformers/main/scripts/distributed/torch-distributed-gpu-test.py +``` + +Por ejemplo, para probar cómo interactúan 2 GPUs, haz lo siguiente: + +```bash +python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py +``` +Si ambos procesos pueden hablar entre sí y asignar la memoria de la GPU, cada uno imprimirá un status OK. + +Para más GPUs o nodos, ajusta los argumentos en el script. + +Encontrarás muchos más detalles dentro del script de diagnóstico e incluso una receta de cómo ejecutarlo en un entorno SLURM. + +Un nivel adicional de debug es agregar la variable de entorno `NCCL_DEBUG=INFO` de la siguiente manera: + +```bash +NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py +``` + +Esto mostrará mucha información de debug relacionada con NCCL, que luego puedes buscar online si encuentras que reporta algún problema. O si no estás seguro de cómo interpretar el output, puedes compartir el archivo de log en un Issue. + + +## Detección de Underflow y Overflow + + + +Esta función está disponible actualmente sólo para PyTorch. + + + + + +Para el entrenamiento multi-GPU, requiere DDP (`torch.distributed.launch`). + + + + + +Esta función puede utilizarse con cualquier modelo basado en `nn.Module`. + + + +Si empiezas a obtener `loss=NaN` o el modelo muestra algún otro comportamiento anormal debido a `inf` o `nan` en +activations o weights hay que descubrir dónde se produce el primer underflow o overflow y qué lo ha provocado. Por suerte +puedes lograrlo fácilmente activando un módulo especial que hará la detección automáticamente. + +Si estás usando [`Trainer`], solo necesitas añadir: + +```bash +--debug underflow_overflow +``` + +a los argumentos normales de la línea de comandos, o pasar `debug="underflow_overflow"` al crear el objeto [`TrainingArguments`]. 
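+
+A modo de referencia, este es un boceto mínimo de la segunda opción (el directorio de salida es solo un nombre ilustrativo):
+
+```python
+from transformers import TrainingArguments
+
+# Equivalente a pasar --debug underflow_overflow por la línea de comandos.
+# "salida" es únicamente un directorio de ejemplo.
+training_args = TrainingArguments(output_dir="salida", debug="underflow_overflow")
+```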
+ +Si estás usando tu propio bucle de entrenamiento u otro Trainer puedes lograr lo mismo con: + +```python +from .debug_utils import DebugUnderflowOverflow + +debug_overflow = DebugUnderflowOverflow(model) +``` + +[`~debug_utils.DebugUnderflowOverflow`] inserta hooks en el modelo que inmediatamente después de cada forward +testeará las variables de input y output y también los weights del módulo correspondiente. Tan pronto como se detecte `inf` o +`nan` se detecta en al menos un elemento de las activations o weights, el programa afirmará e imprimirá un informe +como este (esto fue capturado con `google/mt5-small` bajo fp16 mixed precision): + +``` +Detected inf/nan during batch_number=0 +Last 21 forward frames: +abs min abs max metadata + encoder.block.1.layer.1.DenseReluDense.dropout Dropout +0.00e+00 2.57e+02 input[0] +0.00e+00 2.85e+02 output +[...] + encoder.block.2.layer.0 T5LayerSelfAttention +6.78e-04 3.15e+03 input[0] +2.65e-04 3.42e+03 output[0] + None output[1] +2.25e-01 1.00e+04 output[2] + encoder.block.2.layer.1.layer_norm T5LayerNorm +8.69e-02 4.18e-01 weight +2.65e-04 3.42e+03 input[0] +1.79e-06 4.65e+00 output + encoder.block.2.layer.1.DenseReluDense.wi_0 Linear +2.17e-07 4.50e+00 weight +1.79e-06 4.65e+00 input[0] +2.68e-06 3.70e+01 output + encoder.block.2.layer.1.DenseReluDense.wi_1 Linear +8.08e-07 2.66e+01 weight +1.79e-06 4.65e+00 input[0] +1.27e-04 2.37e+02 output + encoder.block.2.layer.1.DenseReluDense.dropout Dropout +0.00e+00 8.76e+03 input[0] +0.00e+00 9.74e+03 output + encoder.block.2.layer.1.DenseReluDense.wo Linear +1.01e-06 6.44e+00 weight +0.00e+00 9.74e+03 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense +1.79e-06 4.65e+00 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.dropout Dropout +3.18e-04 6.27e+04 input[0] +0.00e+00 inf output +``` + +El output del ejemplo se ha recortado en el centro por razones de brevedad. + +La segunda columna muestra el valor del elemento más grande en términos absolutos, por lo que si observas con detenimiento los últimos fotogramas, +los inputs y outputs estaban en el rango de `1e4`. Así que cuando este entrenamiento se hizo con fp16 mixed precision, +el último paso sufrió overflow (ya que bajo `fp16` el mayor número antes de `inf` es `64e3`). Para evitar overflows en +`fp16` las activations deben permanecer muy por debajo de `1e4`, porque `1e4 * 1e4 = 1e8` por lo que cualquier matrix multiplication con +grandes activations va a llevar a una condición de overflow numérico. + +Al principio del output puedes descubrir en qué número de batch se produjo el problema (aquí `Detected inf/nan during batch_number=0` significa que el problema se produjo en el primer batch). + +Cada frame del informe comienza declarando la entrada completamente calificada para el módulo correspondiente que este frame está reportando. +Si nos fijamos sólo en este frame: + +``` + encoder.block.2.layer.1.layer_norm T5LayerNorm +8.69e-02 4.18e-01 weight +2.65e-04 3.42e+03 input[0] +1.79e-06 4.65e+00 output +``` + +Aquí, `encoder.block.2.layer.1.layer_norm` indica que era una layer norm para la primera capa, del segundo +block del encoder. Y la call específica del `forward` es `T5LayerNorm`. + +Veamos los últimos frames de ese informe: + +``` +Detected inf/nan during batch_number=0 +Last 21 forward frames: +abs min abs max metadata +[...] 
+ encoder.block.2.layer.1.DenseReluDense.wi_0 Linear +2.17e-07 4.50e+00 weight +1.79e-06 4.65e+00 input[0] +2.68e-06 3.70e+01 output + encoder.block.2.layer.1.DenseReluDense.wi_1 Linear +8.08e-07 2.66e+01 weight +1.79e-06 4.65e+00 input[0] +1.27e-04 2.37e+02 output + encoder.block.2.layer.1.DenseReluDense.wo Linear +1.01e-06 6.44e+00 weight +0.00e+00 9.74e+03 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense +1.79e-06 4.65e+00 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.dropout Dropout +3.18e-04 6.27e+04 input[0] +0.00e+00 inf output +``` + +El último frame informa para la función `Dropout.forward` con la primera entrada para el único input y la segunda para el +único output. Puedes ver que fue llamada desde un atributo `dropout` dentro de la clase `DenseReluDense`. Podemos ver +que ocurrió durante la primera capa, del segundo block, durante el primer batch. Por último, el mayor absoluto +elementos de input fue `6.27e+04` y el mismo para el output fue `inf`. + +Puedes ver aquí, que `T5DenseGatedGeluDense.forward` resultó en output activations, cuyo valor máximo absoluto fue +alrededor de 62.7K, que está muy cerca del límite máximo de fp16 de 64K. En el siguiente frame tenemos `Dropout`, el cual renormaliza +los weights, después de poner a cero algunos de los elementos, lo que empuja el valor máximo absoluto a más de 64K, y obtenemos un +overflow (`inf`). + +Como puedes ver son los frames anteriores los que tenemos que mirar cuando los números empiezan a ser muy grandes para números fp16. + +Combinemos el informe con el código de `models/t5/modeling_t5.py`: + +```python +class T5DenseGatedGeluDense(nn.Module): + def __init__(self, config): + super().__init__() + self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout_rate) + self.gelu_act = ACT2FN["gelu_new"] + + def forward(self, hidden_states): + hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states +``` + +Ahora es fácil ver la call `dropout`, y también todas las calls anteriores. + +Dado que la detección se produce en un forward hook, estos informes se imprimen inmediatamente después de que cada `forward` +responda. + +Volviendo al informe completo, para actuar sobre él y arreglar el problema, tenemos que subir unos cuantos frames donde los números +empezaron a subir y probablemente cambiar al modo `fp32` aquí, para que los números no sufran overflow cuando se multipliquen +o al sumarlos. Por supuesto, puede haber otras soluciones. 
Por ejemplo, podríamos desactivar `amp` temporalmente si está +activado, después de mover el original `forward` dentro de un helper wrapper, así: + +```python +def _forward(self, hidden_states): + hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states + + +import torch + + +def forward(self, hidden_states): + if torch.is_autocast_enabled(): + with torch.cuda.amp.autocast(enabled=False): + return self._forward(hidden_states) + else: + return self._forward(hidden_states) +``` + +Como el detector automático sólo informa de los inputs y outputs de los frames completos, una vez que sepas dónde buscar, puedes +analizar también las etapas intermedias de una función específica de `forward`. En este caso, puede utilizar la función +función de ayuda `detect_overflow` para inyectar el detector donde quieras, por ejemplo: + +```python +from debug_utils import detect_overflow + + +class T5LayerFF(nn.Module): + [...] + + def forward(self, hidden_states): + forwarded_states = self.layer_norm(hidden_states) + detect_overflow(forwarded_states, "after layer_norm") + forwarded_states = self.DenseReluDense(forwarded_states) + detect_overflow(forwarded_states, "after DenseReluDense") + return hidden_states + self.dropout(forwarded_states) +``` + +Puedes ver que hemos añadido 2 de estos y ahora se trackea si `inf` o `nan` para `forwarded_states` fue detectado +en algún punto intermedio. + +De hecho, el detector ya informa de esto porque cada una de las llamadas en el ejemplo anterior es un `nn.Module`, pero +digamos que si tuvieras algunos cálculos directos locales, así es como lo harías. + +Además, si estás instanciando el debugger en tu propio código, puedes ajustar el número de frames impresos de +su valor por defecto, por ejemplo: + +```python +from .debug_utils import DebugUnderflowOverflow + +debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100) +``` + +### Rastreo de valores mínimos y máximos absolutos de batches específicos + +La misma clase de debugging se puede utilizar para el rastreo por batches con la función de detección de underflow/overflow desactivada. + +Digamos que quieres ver los valores mínimos y máximos absolutos de todos los ingredientes de cada call `forward` de un determinado +batch, y sólo hacerlo para los batches 1 y 3. Entonces instancias esta clase como: + +```python +debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3]) +``` + +Y ahora los batches 1 y 3 completos serán rastreados usando el mismo formato que el detector de underflow/overflow. + +Los batches son 0-index. + +Esto es muy útil si sabes que el programa empieza a comportarse mal después de un determinado número de batch, para que puedas avanzar rápidamente +hasta esa área. Aquí hay un ejemplo de output recortado para tal configuración: + +``` + *** Starting batch number=1 *** +abs min abs max metadata + shared Embedding +1.01e-06 7.92e+02 weight +0.00e+00 2.47e+04 input[0] +5.36e-05 7.92e+02 output +[...] 
+ decoder.dropout Dropout +1.60e-07 2.27e+01 input[0] +0.00e+00 2.52e+01 output + decoder T5Stack + not a tensor output + lm_head Linear +1.01e-06 7.92e+02 weight +0.00e+00 1.11e+00 input[0] +6.06e-02 8.39e+01 output + T5ForConditionalGeneration + not a tensor output + + *** Starting batch number=3 *** +abs min abs max metadata + shared Embedding +1.01e-06 7.92e+02 weight +0.00e+00 2.78e+04 input[0] +5.36e-05 7.92e+02 output +[...] +``` + +Aquí obtendrás un gran número de frames mostrados - tantos como forward calls haya en tu modelo, por lo que puede o no ser lo que quieras, pero a veces puede ser más fácil de usar para debug que un debugger normal. +Por ejemplo, si un problema comienza a ocurrir en el batch 150. Entonces puedes mostrar las trazas de los batches 149 y 150 y comparar dónde +los números empezaron a divergir. + +También puedes especificar el número de batch después del cual se debe detener el entrenamiento, con: + +```python +debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3) +``` diff --git a/docs/source/es/debugging.mdx b/docs/source/es/debugging.mdx deleted file mode 100644 index a709e0407b8b..000000000000 --- a/docs/source/es/debugging.mdx +++ /dev/null @@ -1,331 +0,0 @@ - - -# Debugging - -## Debug de problemas de Network multi-GPU - -Cuando entrenas o infieres con `DistributedDataParallel` y varias GPUs, si encuentras problemas de intercomunicación entre procesos y/o nodos, puedes usar el siguiente script para diagnosticar problemas de red. - -```bash -wget https://raw.githubusercontent.com/huggingface/transformers/main/scripts/distributed/torch-distributed-gpu-test.py -``` - -Por ejemplo, para probar cómo interactúan 2 GPUs, haz lo siguiente: - -```bash -python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py -``` -Si ambos procesos pueden hablar entre sí y asignar la memoria de la GPU, cada uno imprimirá un status OK. - -Para más GPUs o nodos, ajusta los argumentos en el script. - -Encontrarás muchos más detalles dentro del script de diagnóstico e incluso una receta de cómo ejecutarlo en un entorno SLURM. - -Un nivel adicional de debug es agregar la variable de entorno `NCCL_DEBUG=INFO` de la siguiente manera: - -```bash -NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py -``` - -Esto mostrará mucha información de debug relacionada con NCCL, que luego puedes buscar online si encuentras que reporta algún problema. O si no estás seguro de cómo interpretar el output, puedes compartir el archivo de log en un Issue. - - -## Detección de Underflow y Overflow - - - -Esta función está disponible actualmente sólo para PyTorch. - - - - - -Para el entrenamiento multi-GPU, requiere DDP (`torch.distributed.launch`). - - - - - -Esta función puede utilizarse con cualquier modelo basado en `nn.Module`. - - - -Si empiezas a obtener `loss=NaN` o el modelo muestra algún otro comportamiento anormal debido a `inf` o `nan` en -activations o weights hay que descubrir dónde se produce el primer underflow o overflow y qué lo ha provocado. Por suerte -puedes lograrlo fácilmente activando un módulo especial que hará la detección automáticamente. - -Si estás usando [`Trainer`], solo necesitas añadir: - -```bash ---debug underflow_overflow -``` - -a los argumentos normales de la línea de comandos, o pasar `debug="underflow_overflow"` al crear el objeto [`TrainingArguments`]. 
- -Si estás usando tu propio bucle de entrenamiento u otro Trainer puedes lograr lo mismo con: - -```python -from .debug_utils import DebugUnderflowOverflow - -debug_overflow = DebugUnderflowOverflow(model) -``` - -[`~debug_utils.DebugUnderflowOverflow`] inserta hooks en el modelo que inmediatamente después de cada forward -testeará las variables de input y output y también los weights del módulo correspondiente. Tan pronto como se detecte `inf` o -`nan` se detecta en al menos un elemento de las activations o weights, el programa afirmará e imprimirá un informe -como este (esto fue capturado con `google/mt5-small` bajo fp16 mixed precision): - -``` -Detected inf/nan during batch_number=0 -Last 21 forward frames: -abs min abs max metadata - encoder.block.1.layer.1.DenseReluDense.dropout Dropout -0.00e+00 2.57e+02 input[0] -0.00e+00 2.85e+02 output -[...] - encoder.block.2.layer.0 T5LayerSelfAttention -6.78e-04 3.15e+03 input[0] -2.65e-04 3.42e+03 output[0] - None output[1] -2.25e-01 1.00e+04 output[2] - encoder.block.2.layer.1.layer_norm T5LayerNorm -8.69e-02 4.18e-01 weight -2.65e-04 3.42e+03 input[0] -1.79e-06 4.65e+00 output - encoder.block.2.layer.1.DenseReluDense.wi_0 Linear -2.17e-07 4.50e+00 weight -1.79e-06 4.65e+00 input[0] -2.68e-06 3.70e+01 output - encoder.block.2.layer.1.DenseReluDense.wi_1 Linear -8.08e-07 2.66e+01 weight -1.79e-06 4.65e+00 input[0] -1.27e-04 2.37e+02 output - encoder.block.2.layer.1.DenseReluDense.dropout Dropout -0.00e+00 8.76e+03 input[0] -0.00e+00 9.74e+03 output - encoder.block.2.layer.1.DenseReluDense.wo Linear -1.01e-06 6.44e+00 weight -0.00e+00 9.74e+03 input[0] -3.18e-04 6.27e+04 output - encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense -1.79e-06 4.65e+00 input[0] -3.18e-04 6.27e+04 output - encoder.block.2.layer.1.dropout Dropout -3.18e-04 6.27e+04 input[0] -0.00e+00 inf output -``` - -El output del ejemplo se ha recortado en el centro por razones de brevedad. - -La segunda columna muestra el valor del elemento más grande en términos absolutos, por lo que si observas con detenimiento los últimos fotogramas, -los inputs y outputs estaban en el rango de `1e4`. Así que cuando este entrenamiento se hizo con fp16 mixed precision, -el último paso sufrió overflow (ya que bajo `fp16` el mayor número antes de `inf` es `64e3`). Para evitar overflows en -`fp16` las activations deben permanecer muy por debajo de `1e4`, porque `1e4 * 1e4 = 1e8` por lo que cualquier matrix multiplication con -grandes activations va a llevar a una condición de overflow numérico. - -Al principio del output puedes descubrir en qué número de batch se produjo el problema (aquí `Detected inf/nan during batch_number=0` significa que el problema se produjo en el primer batch). - -Cada frame del informe comienza declarando la entrada completamente calificada para el módulo correspondiente que este frame está reportando. -Si nos fijamos sólo en este frame: - -``` - encoder.block.2.layer.1.layer_norm T5LayerNorm -8.69e-02 4.18e-01 weight -2.65e-04 3.42e+03 input[0] -1.79e-06 4.65e+00 output -``` - -Aquí, `encoder.block.2.layer.1.layer_norm` indica que era una layer norm para la primera capa, del segundo -block del encoder. Y la call específica del `forward` es `T5LayerNorm`. - -Veamos los últimos frames de ese informe: - -``` -Detected inf/nan during batch_number=0 -Last 21 forward frames: -abs min abs max metadata -[...] 
- encoder.block.2.layer.1.DenseReluDense.wi_0 Linear -2.17e-07 4.50e+00 weight -1.79e-06 4.65e+00 input[0] -2.68e-06 3.70e+01 output - encoder.block.2.layer.1.DenseReluDense.wi_1 Linear -8.08e-07 2.66e+01 weight -1.79e-06 4.65e+00 input[0] -1.27e-04 2.37e+02 output - encoder.block.2.layer.1.DenseReluDense.wo Linear -1.01e-06 6.44e+00 weight -0.00e+00 9.74e+03 input[0] -3.18e-04 6.27e+04 output - encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense -1.79e-06 4.65e+00 input[0] -3.18e-04 6.27e+04 output - encoder.block.2.layer.1.dropout Dropout -3.18e-04 6.27e+04 input[0] -0.00e+00 inf output -``` - -El último frame informa para la función `Dropout.forward` con la primera entrada para el único input y la segunda para el -único output. Puedes ver que fue llamada desde un atributo `dropout` dentro de la clase `DenseReluDense`. Podemos ver -que ocurrió durante la primera capa, del segundo block, durante el primer batch. Por último, el mayor absoluto -elementos de input fue `6.27e+04` y el mismo para el output fue `inf`. - -Puedes ver aquí, que `T5DenseGatedGeluDense.forward` resultó en output activations, cuyo valor máximo absoluto fue -alrededor de 62.7K, que está muy cerca del límite máximo de fp16 de 64K. En el siguiente frame tenemos `Dropout`, el cual renormaliza -los weights, después de poner a cero algunos de los elementos, lo que empuja el valor máximo absoluto a más de 64K, y obtenemos un -overflow (`inf`). - -Como puedes ver son los frames anteriores los que tenemos que mirar cuando los números empiezan a ser muy grandes para números fp16. - -Combinemos el informe con el código de `models/t5/modeling_t5.py`: - -```python -class T5DenseGatedGeluDense(nn.Module): - def __init__(self, config): - super().__init__() - self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False) - self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False) - self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) - self.dropout = nn.Dropout(config.dropout_rate) - self.gelu_act = ACT2FN["gelu_new"] - - def forward(self, hidden_states): - hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) - hidden_linear = self.wi_1(hidden_states) - hidden_states = hidden_gelu * hidden_linear - hidden_states = self.dropout(hidden_states) - hidden_states = self.wo(hidden_states) - return hidden_states -``` - -Ahora es fácil ver la call `dropout`, y también todas las calls anteriores. - -Dado que la detección se produce en un forward hook, estos informes se imprimen inmediatamente después de que cada `forward` -responda. - -Volviendo al informe completo, para actuar sobre él y arreglar el problema, tenemos que subir unos cuantos frames donde los números -empezaron a subir y probablemente cambiar al modo `fp32` aquí, para que los números no sufran overflow cuando se multipliquen -o al sumarlos. Por supuesto, puede haber otras soluciones. 
Por ejemplo, podríamos desactivar `amp` temporalmente si está -activado, después de mover el original `forward` dentro de un helper wrapper, así: - -```python -def _forward(self, hidden_states): - hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) - hidden_linear = self.wi_1(hidden_states) - hidden_states = hidden_gelu * hidden_linear - hidden_states = self.dropout(hidden_states) - hidden_states = self.wo(hidden_states) - return hidden_states - - -import torch - - -def forward(self, hidden_states): - if torch.is_autocast_enabled(): - with torch.cuda.amp.autocast(enabled=False): - return self._forward(hidden_states) - else: - return self._forward(hidden_states) -``` - -Como el detector automático sólo informa de los inputs y outputs de los frames completos, una vez que sepas dónde buscar, puedes -analizar también las etapas intermedias de una función específica de `forward`. En este caso, puede utilizar la función -función de ayuda `detect_overflow` para inyectar el detector donde quieras, por ejemplo: - -```python -from debug_utils import detect_overflow - - -class T5LayerFF(nn.Module): - [...] - - def forward(self, hidden_states): - forwarded_states = self.layer_norm(hidden_states) - detect_overflow(forwarded_states, "after layer_norm") - forwarded_states = self.DenseReluDense(forwarded_states) - detect_overflow(forwarded_states, "after DenseReluDense") - return hidden_states + self.dropout(forwarded_states) -``` - -Puedes ver que hemos añadido 2 de estos y ahora se trackea si `inf` o `nan` para `forwarded_states` fue detectado -en algún punto intermedio. - -De hecho, el detector ya informa de esto porque cada una de las llamadas en el ejemplo anterior es un `nn.Module`, pero -digamos que si tuvieras algunos cálculos directos locales, así es como lo harías. - -Además, si estás instanciando el debugger en tu propio código, puedes ajustar el número de frames impresos de -su valor por defecto, por ejemplo: - -```python -from .debug_utils import DebugUnderflowOverflow - -debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100) -``` - -### Rastreo de valores mínimos y máximos absolutos de batches específicos - -La misma clase de debugging se puede utilizar para el rastreo por batches con la función de detección de underflow/overflow desactivada. - -Digamos que quieres ver los valores mínimos y máximos absolutos de todos los ingredientes de cada call `forward` de un determinado -batch, y sólo hacerlo para los batches 1 y 3. Entonces instancias esta clase como: - -```python -debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3]) -``` - -Y ahora los batches 1 y 3 completos serán rastreados usando el mismo formato que el detector de underflow/overflow. - -Los batches son 0-index. - -Esto es muy útil si sabes que el programa empieza a comportarse mal después de un determinado número de batch, para que puedas avanzar rápidamente -hasta esa área. Aquí hay un ejemplo de output recortado para tal configuración: - -``` - *** Starting batch number=1 *** -abs min abs max metadata - shared Embedding -1.01e-06 7.92e+02 weight -0.00e+00 2.47e+04 input[0] -5.36e-05 7.92e+02 output -[...] 
- decoder.dropout Dropout -1.60e-07 2.27e+01 input[0] -0.00e+00 2.52e+01 output - decoder T5Stack - not a tensor output - lm_head Linear -1.01e-06 7.92e+02 weight -0.00e+00 1.11e+00 input[0] -6.06e-02 8.39e+01 output - T5ForConditionalGeneration - not a tensor output - - *** Starting batch number=3 *** -abs min abs max metadata - shared Embedding -1.01e-06 7.92e+02 weight -0.00e+00 2.78e+04 input[0] -5.36e-05 7.92e+02 output -[...] -``` - -Aquí obtendrás un gran número de frames mostrados - tantos como forward calls haya en tu modelo, por lo que puede o no ser lo que quieras, pero a veces puede ser más fácil de usar para debug que un debugger normal. -Por ejemplo, si un problema comienza a ocurrir en el batch 150. Entonces puedes mostrar las trazas de los batches 149 y 150 y comparar dónde -los números empezaron a divergir. - -También puedes especificar el número de batch después del cual se debe detener el entrenamiento, con: - -```python -debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3) -``` diff --git a/docs/source/es/fast_tokenizers.md b/docs/source/es/fast_tokenizers.md new file mode 100644 index 000000000000..92b925f67f7e --- /dev/null +++ b/docs/source/es/fast_tokenizers.md @@ -0,0 +1,74 @@ + + +# Usa los tokenizadores de 🤗 Tokenizers + +[`PreTrainedTokenizerFast`] depende de la biblioteca [🤗 Tokenizers](https://huggingface.co/docs/tokenizers). Los tokenizadores obtenidos desde la biblioteca 🤗 Tokenizers pueden ser +cargados de forma muy sencilla en los 🤗 Transformers. + +Antes de entrar en detalles, comencemos creando un tokenizador dummy en unas cuantas líneas: + +```python +>>> from tokenizers import Tokenizer +>>> from tokenizers.models import BPE +>>> from tokenizers.trainers import BpeTrainer +>>> from tokenizers.pre_tokenizers import Whitespace + +>>> tokenizer = Tokenizer(BPE(unk_token="[UNK]")) +>>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) + +>>> tokenizer.pre_tokenizer = Whitespace() +>>> files = [...] +>>> tokenizer.train(files, trainer) +``` + +Ahora tenemos un tokenizador entrenado en los archivos que definimos. Lo podemos seguir utilizando en ese entorno de ejecución (runtime en inglés), o puedes guardarlo +en un archivo JSON para reutilizarlo en un futuro. + +## Cargando directamente desde el objeto tokenizador + +Veamos cómo utilizar este objeto tokenizador en la biblioteca 🤗 Transformers. La clase +[`PreTrainedTokenizerFast`] permite una instanciación fácil, al aceptar el objeto +*tokenizer* instanciado como argumento: + +```python +>>> from transformers import PreTrainedTokenizerFast + +>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer) +``` + +Este objeto ya puede ser utilizado con todos los métodos compartidos por los tokenizadores de 🤗 Transformers! Visita la [página sobre tokenizadores +](main_classes/tokenizer) para más información. 
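+
+Como pequeño ejemplo de uso (asumiendo el `fast_tokenizer` creado arriba; los ids concretos dependerán del vocabulario con el que se haya entrenado el tokenizador):
+
+```python
+>>> # Se puede llamar como cualquier otro tokenizador de 🤗 Transformers
+>>> encoding = fast_tokenizer("Hola, mundo")
+>>> tokens = fast_tokenizer.convert_ids_to_tokens(encoding["input_ids"])
+```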
+ +## Cargando desde un archivo JSON + +Para cargar un tokenizador desde un archivo JSON, comencemos por guardar nuestro tokenizador: + +```python +>>> tokenizer.save("tokenizer.json") +``` + +La localización (path en inglés) donde este archivo es guardado puede ser incluida en el método de inicialización de [`PreTrainedTokenizerFast`] +utilizando el parámetro `tokenizer_file`: + +```python +>>> from transformers import PreTrainedTokenizerFast + +>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json") +``` + +Este objeto ya puede ser utilizado con todos los métodos compartidos por los tokenizadores de 🤗 Transformers! Visita la [página sobre tokenizadores +](main_classes/tokenizer) para más información. diff --git a/docs/source/es/fast_tokenizers.mdx b/docs/source/es/fast_tokenizers.mdx deleted file mode 100644 index 63b43cc1c4c7..000000000000 --- a/docs/source/es/fast_tokenizers.mdx +++ /dev/null @@ -1,70 +0,0 @@ - - -# Usa los tokenizadores de 🤗 Tokenizers - -[`PreTrainedTokenizerFast`] depende de la biblioteca [🤗 Tokenizers](https://huggingface.co/docs/tokenizers). Los tokenizadores obtenidos desde la biblioteca 🤗 Tokenizers pueden ser -cargados de forma muy sencilla en los 🤗 Transformers. - -Antes de entrar en detalles, comencemos creando un tokenizador dummy en unas cuantas líneas: - -```python ->>> from tokenizers import Tokenizer ->>> from tokenizers.models import BPE ->>> from tokenizers.trainers import BpeTrainer ->>> from tokenizers.pre_tokenizers import Whitespace - ->>> tokenizer = Tokenizer(BPE(unk_token="[UNK]")) ->>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) - ->>> tokenizer.pre_tokenizer = Whitespace() ->>> files = [...] ->>> tokenizer.train(files, trainer) -``` - -Ahora tenemos un tokenizador entrenado en los archivos que definimos. Lo podemos seguir utilizando en ese entorno de ejecución (runtime en inglés), o puedes guardarlo -en un archivo JSON para reutilizarlo en un futuro. - -## Cargando directamente desde el objeto tokenizador - -Veamos cómo utilizar este objeto tokenizador en la biblioteca 🤗 Transformers. La clase -[`PreTrainedTokenizerFast`] permite una instanciación fácil, al aceptar el objeto -*tokenizer* instanciado como argumento: - -```python ->>> from transformers import PreTrainedTokenizerFast - ->>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer) -``` - -Este objeto ya puede ser utilizado con todos los métodos compartidos por los tokenizadores de 🤗 Transformers! Visita la [página sobre tokenizadores -](main_classes/tokenizer) para más información. - -## Cargando desde un archivo JSON - -Para cargar un tokenizador desde un archivo JSON, comencemos por guardar nuestro tokenizador: - -```python ->>> tokenizer.save("tokenizer.json") -``` - -La localización (path en inglés) donde este archivo es guardado puede ser incluida en el método de inicialización de [`PreTrainedTokenizerFast`] -utilizando el parámetro `tokenizer_file`: - -```python ->>> from transformers import PreTrainedTokenizerFast - ->>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json") -``` - -Este objeto ya puede ser utilizado con todos los métodos compartidos por los tokenizadores de 🤗 Transformers! Visita la [página sobre tokenizadores -](main_classes/tokenizer) para más información. 
diff --git a/docs/source/es/index.md b/docs/source/es/index.md new file mode 100644 index 000000000000..caefdfb7ad7b --- /dev/null +++ b/docs/source/es/index.md @@ -0,0 +1,281 @@ + + +# 🤗 Transformers + +Machine Learning de última generación para PyTorch, TensorFlow y JAX. + +🤗 Transformers proporciona APIs para descargar y entrenar fácilmente modelos preentrenados de última generación. El uso de modelos preentrenados puede reducir tus costos de cómputo, tu huella de carbono y ahorrarte tiempo al entrenar un modelo desde cero. Los modelos se pueden utilizar en diferentes modalidades, tales como: + +* 📝 Texto: clasificación de texto, extracción de información, respuesta a preguntas, resumir, traducción y generación de texto en más de 100 idiomas. +* 🖼️ Imágenes: clasificación de imágenes, detección de objetos y segmentación. +* 🗣️ Audio: reconocimiento de voz y clasificación de audio. +* 🐙 Multimodal: respuesta a preguntas en tablas, reconocimiento óptico de caracteres, extracción de información de documentos escaneados, clasificación de videos y respuesta visual a preguntas. + +Nuestra biblioteca admite una integración perfecta entre tres de las bibliotecas de deep learning más populares: [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/) y [JAX](https://jax.readthedocs.io/en/latest/). Entrena tu modelo con tres líneas de código en un framework y cárgalo para inferencia con otro. +Cada arquitectura de 🤗 Transformers se define en un módulo de Python independiente para que se puedan personalizar fácilmente para investigación y experimentos. + +## Si estás buscando soporte personalizado del equipo de Hugging Face + + +HuggingFace Expert Acceleration Program + + +## Contenidos + +La documentación está organizada en cuatro partes: + +- **EMPEZAR** contiene un recorrido rápido e instrucciones de instalación para comenzar a usar 🤗 Transformers. +- **TUTORIALES** es un excelente lugar para comenzar. Esta sección te ayudará a obtener las habilidades básicas que necesitas para comenzar a usar 🤗 Transformers. +- **GUÍAS PRÁCTICAS** te mostrará cómo lograr un objetivo específico, cómo hacer fine-tuning a un modelo preentrenado para el modelado de lenguaje o cómo crear un cabezal para un modelo personalizado. +- **GUÍAS CONCEPTUALES** proporciona más discusión y explicación de los conceptos e ideas subyacentes detrás de los modelos, las tareas y la filosofía de diseño de 🤗 Transformers. + +La biblioteca actualmente contiene implementaciones de JAX, PyTorch y TensorFlow, pesos de modelos preentrenados, scripts de uso y utilidades de conversión para los siguientes modelos. + +### Modelos compatibles + + + +1. **[ALBERT](model_doc/albert)** (de Google Research y el Instituto Tecnológico de Toyota en Chicago) publicado con el paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), por Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. +1. **[ALIGN](model_doc/align)** (de Google Research) publicado con el paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) por Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig. +1. 
**[BART](model_doc/bart)** (de Facebook) publicado con el paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) por Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov y Luke Zettlemoyer. +1. **[BARThez](model_doc/barthez)** (de École polytechnique) publicado con el paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) por Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. +1. **[BARTpho](model_doc/bartpho)** (de VinAI Research) publicado con el paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) por Nguyen Luong Tran, Duong Minh Le y Dat Quoc Nguyen. +1. **[BEiT](model_doc/beit)** (de Microsoft) publicado con el paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) por Hangbo Bao, Li Dong, Furu Wei. +1. **[BERT](model_doc/bert)** (de Google) publicado con el paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) por Jacob Devlin, Ming-Wei Chang, Kenton Lee y Kristina Toutanova. +1. **[BERTweet](model_doc/bertweet)** (de VinAI Research) publicado con el paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) por Dat Quoc Nguyen, Thanh Vu y Anh Tuan Nguyen. +1. **[BERT For Sequence Generation](model_doc/bert-generation)** (de Google) publicado con el paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) por Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[BigBird-RoBERTa](model_doc/big_bird)** (de Google Research) publicado con el paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) por Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[BigBird-Pegasus](model_doc/bigbird_pegasus)** (de Google Research) publicado con el paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) por Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[Blenderbot](model_doc/blenderbot)** (de Facebook) publicado con el paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) por Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. +1. **[BlenderbotSmall](model_doc/blenderbot-small)** (de Facebook) publicado con el paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) por Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. +1. **[BORT](model_doc/bort)** (de Alexa) publicado con el paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) por Adrian de Wynter y Daniel J. Perry. +1. 
**[ByT5](model_doc/byt5)** (de Google Research) publicado con el paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) por Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. +1. **[CamemBERT](model_doc/camembert)** (de Inria/Facebook/Sorbonne) publicado con el paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) por Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah y Benoît Sagot. +1. **[CANINE](model_doc/canine)** (de Google Research) publicado con el paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) por Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. +1. **[ConvNeXT](model_doc/convnext)** (de Facebook AI) publicado con el paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) por Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. +1. **[ConvNeXTV2](model_doc/convnextv2)** (de Facebook AI) publicado con el paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) por Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. +1. **[CLIP](model_doc/clip)** (de OpenAI) publicado con el paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) por Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. +1. **[ConvBERT](model_doc/convbert)** (de YituTech) publicado con el paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) por Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. +1. **[CPM](model_doc/cpm)** (de Universidad de Tsinghua) publicado con el paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) por Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. +1. **[CTRL](model_doc/ctrl)** (de Salesforce) publicado con el paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) por Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong y Richard Socher. +1. **[Data2Vec](model_doc/data2vec)** (de Facebook) publicado con el paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) por Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. +1. **[DeBERTa](model_doc/deberta)** (de Microsoft) publicado con el paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) por Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. +1. **[DeBERTa-v2](model_doc/deberta-v2)** (de Microsoft) publicado con el paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) por Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. +1. 
**[Decision Transformer](model_doc/decision_transformer)** (de Berkeley/Facebook/Google) publicado con el paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) por Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. +1. **[DiT](model_doc/dit)** (de Microsoft Research) publicado con el paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) por Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. +1. **[DeiT](model_doc/deit)** (de Facebook) publicado con el paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) por Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. +1. **[DETR](model_doc/detr)** (de Facebook) publicado con el paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) por Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. +1. **[DialoGPT](model_doc/dialogpt)** (de Microsoft Research) publicado con el paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) por Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. +1. **[DistilBERT](model_doc/distilbert)** (de HuggingFace), publicado junto con el paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) por Victor Sanh, Lysandre Debut y Thomas Wolf. Se ha aplicado el mismo método para comprimir GPT2 en [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa en [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), BERT multilingüe en [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) y una versión alemana de DistilBERT. +1. **[DPR](model_doc/dpr)** (de Facebook) publicado con el paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) por Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, y Wen-tau Yih. +1. **[DPT](master/model_doc/dpt)** (de Intel Labs) publicado con el paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) por René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. +1. **[EfficientNet](model_doc/efficientnet)** (from Google Research) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan and Quoc V. Le. +1. **[EncoderDecoder](model_doc/encoder-decoder)** (de Google Research) publicado con el paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) por Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[ELECTRA](model_doc/electra)** (de Google Research/Universidad de Stanford) publicado con el paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) por Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. +1. 
**[FlauBERT](model_doc/flaubert)** (de CNRS) publicado con el paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) por Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. +1. **[FNet](model_doc/fnet)** (de Google Research) publicado con el paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) por James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. +1. **[Funnel Transformer](model_doc/funnel)** (de CMU/Google Brain) publicado con el paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) por Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. +1. **[GLPN](model_doc/glpn)** (de KAIST) publicado con el paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) por Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. +1. **[GPT](model_doc/openai-gpt)** (de OpenAI) publicado con el paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) por Alec Radford, Karthik Narasimhan, Tim Salimans y Ilya Sutskever. +1. **[GPT-2](model_doc/gpt2)** (de OpenAI) publicado con el paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) por Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** y Ilya Sutskever**. +1. **[GPT-J](model_doc/gptj)** (de EleutherAI) publicado con el repositorio [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) por Ben Wang y Aran Komatsuzaki. +1. **[GPT Neo](model_doc/gpt_neo)** (de EleutherAI) publicado en el paper [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) por Sid Black, Stella Biderman, Leo Gao, Phil Wang y Connor Leahy. +1. **[GPTSAN-japanese](model_doc/gptsan-japanese)** released with [GPTSAN](https://github.com/tanreinama/GPTSAN) by Toshiyuki Sakamoto (tanreinama). +1. **[Hubert](model_doc/hubert)** (de Facebook) publicado con el paper [HuBERT: Self-Supervised Speech Representation Learning por Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) por Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. +1. **[I-BERT](model_doc/ibert)** (de Berkeley) publicado con el paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) por Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. +1. **[ImageGPT](model_doc/imagegpt)** (de OpenAI) publicado con el paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) por Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. +1. **[LayoutLM](model_doc/layoutlm)** (de Microsoft Research Asia) publicado con el paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) por Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. +1. **[LayoutLMv2](model_doc/layoutlmv2)** (de Microsoft Research Asia) publicado con el paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) por Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. 
+1. **[LayoutXLM](model_doc/layoutxlm)** (de Microsoft Research Asia) publicado con el paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) por Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. +1. **[LED](model_doc/led)** (de AllenAI) publicado con el paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) por Iz Beltagy, Matthew E. Peters, Arman Cohan. +1. **[Longformer](model_doc/longformer)** (de AllenAI) publicado con el paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) por Iz Beltagy, Matthew E. Peters, Arman Cohan. +1. **[LUKE](model_doc/luke)** (de Studio Ousia) publicado con el paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) por Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. +1. **[mLUKE](model_doc/mluke)** (de Studio Ousia) publicado con el paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) por Ryokan Ri, Ikuya Yamada, y Yoshimasa Tsuruoka. +1. **[LXMERT](model_doc/lxmert)** (de UNC Chapel Hill) publicado con el paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) por Hao Tan y Mohit Bansal. +1. **[M2M100](model_doc/m2m_100)** (de Facebook) publicado con el paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) por Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. +1. **[MarianMT](model_doc/marian)** Modelos de traducción automática entrenados usando [OPUS](http://opus.nlpl.eu/) data por Jörg Tiedemann. El [Marian Framework](https://marian-nmt.github.io/) está siendo desarrollado por el equipo de traductores de Microsoft. +1. **[Mask2Former](model_doc/mask2former)** (de FAIR y UIUC) publicado con el paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) por Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. +1. **[MaskFormer](model_doc/maskformer)** (de Meta y UIUC) publicado con el paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) por Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. +1. **[MBart](model_doc/mbart)** (de Facebook) publicado con el paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) por Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. +1. **[MBart-50](model_doc/mbart)** (de Facebook) publicado con el paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) por Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. +1. 
**[Megatron-BERT](model_doc/megatron-bert)** (de NVIDIA) publicado con el paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) por Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper y Bryan Catanzaro. +1. **[Megatron-GPT2](model_doc/megatron_gpt2)** (de NVIDIA) publicado con el paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) por Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper y Bryan Catanzaro. +1. **[MPNet](model_doc/mpnet)** (de Microsoft Research) publicado con el paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) por Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. +1. **[MT5](model_doc/mt5)** (de Google AI) publicado con el paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) por Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. +1. **[Nyströmformer](model_doc/nystromformer)** (de la Universidad de Wisconsin - Madison) publicado con el paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) por Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. +1. **[OneFormer](model_doc/oneformer)** (de la SHI Labs) publicado con el paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) por Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. +1. **[Pegasus](model_doc/pegasus)** (de Google) publicado con el paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) por Jingqing Zhang, Yao Zhao, Mohammad Saleh y Peter J. Liu. +1. **[Perceiver IO](model_doc/perceiver)** (de Deepmind) publicado con el paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) por Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. +1. **[PhoBERT](model_doc/phobert)** (de VinAI Research) publicado con el paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) por Dat Quoc Nguyen y Anh Tuan Nguyen. +1. **[PLBart](model_doc/plbart)** (de UCLA NLP) publicado con el paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) por Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. +1. **[PoolFormer](model_doc/poolformer)** (de Sea AI Labs) publicado con el paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) por Yu, Weihao y Luo, Mi y Zhou, Pan y Si, Chenyang y Zhou, Yichen y Wang, Xinchao y Feng, Jiashi y Yan, Shuicheng. +1. **[ProphetNet](model_doc/prophetnet)** (de Microsoft Research) publicado con el paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) por Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang y Ming Zhou. +1. 
**[QDQBert](model_doc/qdqbert)** (de NVIDIA) publicado con el paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) por Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev y Paulius Micikevicius. +1. **[REALM](model_doc/realm.html)** (de Google Research) publicado con el paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) por Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat y Ming-Wei Chang. +1. **[Reformer](model_doc/reformer)** (de Google Research) publicado con el paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) por Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. +1. **[RemBERT](model_doc/rembert)** (de Google Research) publicado con el paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) por Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. +1. **[RegNet](model_doc/regnet)** (de META Platforms) publicado con el paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) por Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. +1. **[ResNet](model_doc/resnet)** (de Microsoft Research) publicado con el paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) por Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. +1. **[RoBERTa](model_doc/roberta)** (de Facebook), publicado junto con el paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) por Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. +1. **[RoFormer](model_doc/roformer)** (de ZhuiyiTechnology), publicado junto con el paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) por Jianlin Su y Yu Lu y Shengfeng Pan y Bo Wen y Yunfeng Liu. +1. **[SegFormer](model_doc/segformer)** (de NVIDIA) publicado con el paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) por Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. +1. **[SEW](model_doc/sew)** (de ASAPP) publicado con el paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) por Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. **[SEW-D](model_doc/sew_d)** (de ASAPP) publicado con el paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) por Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. **[SpeechToTextTransformer](model_doc/speech_to_text)** (de Facebook), publicado junto con el paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) por Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. +1. **[SpeechToTextTransformer2](model_doc/speech_to_text_2)** (de Facebook), publicado junto con el paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) por Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. +1. 
**[Splinter](model_doc/splinter)** (de la Universidad de Tel Aviv), publicado junto con el paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) por Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. +1. **[SqueezeBert](model_doc/squeezebert)** (de Berkeley) publicado con el paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) por Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, y Kurt W. Keutzer. +1. **[Swin Transformer](model_doc/swin)** (de Microsoft) publicado con el paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) por Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. +1. **[T5](model_doc/t5)** (de Google AI) publicado con el paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) por Colin Raffel y Noam Shazeer y Adam Roberts y Katherine Lee y Sharan Narang y Michael Matena y Yanqi Zhou y Wei Li y Peter J. Liu. +1. **[T5v1.1](model_doc/t5v1.1)** (de Google AI) publicado en el repositorio [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) por Colin Raffel y Noam Shazeer y Adam Roberts y Katherine Lee y Sharan Narang y Michael Matena y Yanqi Zhou y Wei Li y Peter J. Liu. +1. **[TAPAS](model_doc/tapas)** (de Google AI) publicado con el paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) por Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno y Julian Martin Eisenschlos. +1. **[TAPEX](model_doc/tapex)** (de Microsoft Research) publicado con el paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) por Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[Transformer-XL](model_doc/transfo-xl)** (de Google/CMU) publicado con el paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) por Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. +1. **[TrOCR](model_doc/trocr)** (de Microsoft), publicado junto con el paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) por Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. +1. **[UniSpeech](model_doc/unispeech)** (de Microsoft Research) publicado con el paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) por Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. +1. **[UniSpeechSat](model_doc/unispeech-sat)** (de Microsoft Research) publicado con el paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) por Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. +1. **[VAN](model_doc/van)** (de la Universidad de Tsinghua y la Universidad de Nankai) publicado con el paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) por Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. +1. 
**[ViLT](model_doc/vilt)** (de NAVER AI Lab/Kakao Enterprise/Kakao Brain) publicado con el paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) por Wonjae Kim, Bokyung Son, Ildoo Kim. +1. **[Vision Transformer (ViT)](model_doc/vit)** (de Google AI) publicado con el paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) por Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[ViTMAE](model_doc/vit_mae)** (de Meta AI) publicado con el paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) por Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. +1. **[VisualBERT](model_doc/visual_bert)** (de UCLA NLP) publicado con el paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) por Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. +1. **[WavLM](model_doc/wavlm)** (de Microsoft Research) publicado con el paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) por Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. +1. **[Wav2Vec2](model_doc/wav2vec2)** (de Facebook AI) publicado con el paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) por Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. +1. **[Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)** (de Facebook AI) publicado con el paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) por Qiantong Xu, Alexei Baevski, Michael Auli. +1. **[XGLM](model_doc/xglm)** (de Facebook AI) publicado con el paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) por Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. +1. **[XLM](model_doc/xlm)** (de Facebook) publicado junto con el paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) por Guillaume Lample y Alexis Conneau. +1. **[XLM-ProphetNet](model_doc/xlm-prophetnet)** (de Microsoft Research) publicado con el paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) por Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang y Ming Zhou. +1. **[XLM-RoBERTa](model_doc/xlm-roberta)** (de Facebook AI), publicado junto con el paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) por Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer y Veselin Stoyanov. +1. 
**[XLM-RoBERTa-XL](model_doc/xlm-roberta-xl)** (de Facebook AI), publicado junto con el paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) por Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. +1. **[XLNet](model_doc/xlnet)** (de Google/CMU) publicado con el paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) por Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. +1. **[XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2)** (de Facebook AI) publicado con el paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) por Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. +1. **[XLS-R](model_doc/xls_r)** (de Facebook AI) publicado con el paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) por Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. +1. **[YOSO](model_doc/yoso)** (de la Universidad de Wisconsin-Madison) publicado con el paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) por Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. + + +### Frameworks compatibles + +La siguiente tabla representa el soporte actual en la biblioteca para cada uno de esos modelos: indica si tienen un tokenizador de Python (llamado "slow"), si cuentan con un tokenizador "fast" respaldado por la biblioteca 🤗 Tokenizers, y si tienen soporte en Jax (a través de +Flax), PyTorch y/o TensorFlow.
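A modo de ilustración, el siguiente fragmento es un esbozo mínimo (asume que PyTorch y TensorFlow están instalados y usa el checkpoint `bert-base-uncased` solo como ejemplo) de cómo pedir explícitamente el tokenizador "fast" o "slow" de un modelo y cargar sus pesos en el framework deseado; la tabla de soporte aparece justo después:

```python
from transformers import AutoTokenizer, AutoModel, TFAutoModel

# use_fast=True devuelve el tokenizador "fast" (respaldado por 🤗 Tokenizers) cuando existe;
# use_fast=False fuerza el tokenizador "slow" implementado en Python.
tokenizer_fast = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
tokenizer_slow = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)

# AutoModel carga los pesos en PyTorch y TFAutoModel en TensorFlow,
# siempre que la arquitectura tenga soporte para ese framework (ver la tabla).
modelo_pt = AutoModel.from_pretrained("bert-base-uncased")
modelo_tf = TFAutoModel.from_pretrained("bert-base-uncased")

print(type(tokenizer_fast).__name__, type(tokenizer_slow).__name__)
```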
+ + + +| Modelo | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support | +|:---------------------------:|:--------------:|:--------------:|:---------------:|:------------------:|:------------:| +| ALBERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| BART | ✅ | ✅ | ✅ | ✅ | ✅ | +| BEiT | ❌ | ❌ | ✅ | ❌ | ✅ | +| BERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ | +| BigBird | ✅ | ✅ | ✅ | ❌ | ✅ | +| BigBirdPegasus | ❌ | ❌ | ✅ | ❌ | ❌ | +| Blenderbot | ✅ | ✅ | ✅ | ✅ | ✅ | +| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ | +| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| Canine | ✅ | ❌ | ✅ | ❌ | ❌ | +| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | +| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| ConvNext | ❌ | ❌ | ✅ | ✅ | ❌ | +| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | +| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ | +| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | +| DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ | +| Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| DeiT | ❌ | ❌ | ✅ | ❌ | ❌ | +| DETR | ❌ | ❌ | ✅ | ❌ | ❌ | +| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| DPR | ✅ | ✅ | ✅ | ✅ | ❌ | +| DPT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | +| Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | +| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ | +| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ | +| FNet | ✅ | ✅ | ✅ | ❌ | ❌ | +| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ | +| GLPN | ❌ | ❌ | ✅ | ❌ | ❌ | +| GPT Neo | ❌ | ❌ | ✅ | ❌ | ✅ | +| GPT-J | ❌ | ❌ | ✅ | ✅ | ✅ | +| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ | +| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ | +| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ | +| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ | +| LED | ✅ | ✅ | ✅ | ✅ | ❌ | +| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ | +| LUKE | ✅ | ❌ | ✅ | ❌ | ❌ | +| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ | +| Marian | ✅ | ❌ | ✅ | ✅ | ✅ | +| MaskFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| mBART | ✅ | ✅ | ✅ | ✅ | ✅ | +| MegatronBert | ❌ | ❌ | ✅ | ❌ | ❌ | +| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ | +| mT5 | ✅ | ✅ | ✅ | ✅ | ✅ | +| Nystromformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ | +| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | +| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ | +| Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ | +| PLBart | ✅ | ❌ | ✅ | ❌ | ❌ | +| PoolFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | +| QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ | +| RAG | ✅ | ❌ | ✅ | ✅ | ❌ | +| Realm | ✅ | ✅ | ✅ | ❌ | ❌ | +| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ | +| RegNet | ❌ | ❌ | ✅ | ✅ | ✅ | +| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| ResNet | ❌ | ❌ | ✅ | ❌ | ✅ | +| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | +| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | +| RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ | +| SegFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| SEW | ❌ | ❌ | ✅ | ❌ | ❌ | +| SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ | +| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ | +| Speech2Text | ✅ | ❌ | ✅ | ✅ | ❌ | +| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ | +| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ | +| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ | +| Swin | ❌ | ❌ | ✅ | ❌ | ❌ | +| T5 | ✅ | ✅ | ✅ | ✅ | ✅ | +| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ | +| TAPEX | ✅ | ✅ | ✅ | ✅ | ✅ | +| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | +| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ | +| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ | +| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ | +| VAN | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViLT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | +| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ | +| VisualBert | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViT | ❌ | ❌ | ✅ | ✅ | ✅ | +| ViTMAE | ❌ | ❌ | ✅ | ✅ | ❌ | +| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | +| WavLM | ❌ | ❌ | ✅ | ❌ | ❌ | +| XGLM | ✅ | ✅ | ✅ | ❌ | ✅ | +| XLM | ✅ | ❌ | ✅ | ✅ | ❌ | +| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | +| 
XLM-RoBERTa-XL | ❌ | ❌ | ✅ | ❌ | ❌ | +| XLMProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | +| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ | +| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ | + + diff --git a/docs/source/es/index.mdx b/docs/source/es/index.mdx deleted file mode 100644 index 5091a52c8231..000000000000 --- a/docs/source/es/index.mdx +++ /dev/null @@ -1,271 +0,0 @@ - - -# 🤗 Transformers - -Machine Learning de última generación para PyTorch, TensorFlow y JAX. - -🤗 Transformers proporciona APIs para descargar y entrenar fácilmente modelos preentrenados de última generación. El uso de modelos preentrenados puede reducir tus costos de cómputo, tu huella de carbono y ahorrarte tiempo al entrenar un modelo desde cero. Los modelos se pueden utilizar en diferentes modalidades, tales como: - -* 📝 Texto: clasificación de texto, extracción de información, respuesta a preguntas, resumir, traducción y generación de texto en más de 100 idiomas. -* 🖼️ Imágenes: clasificación de imágenes, detección de objetos y segmentación. -* 🗣️ Audio: reconocimiento de voz y clasificación de audio. -* 🐙 Multimodal: respuesta a preguntas en tablas, reconocimiento óptico de caracteres, extracción de información de documentos escaneados, clasificación de videos y respuesta visual a preguntas. - -Nuestra biblioteca admite una integración perfecta entre tres de las bibliotecas de deep learning más populares: [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/) y [JAX](https://jax.readthedocs.io/en/latest/). Entrena tu modelo con tres líneas de código en un framework y cárgalo para inferencia con otro. -Cada arquitectura de 🤗 Transformers se define en un módulo de Python independiente para que se puedan personalizar fácilmente para investigación y experimentos. - -## Si estás buscando soporte personalizado del equipo de Hugging Face - - -HuggingFace Expert Acceleration Program - - -## Contenidos - -La documentación está organizada en cuatro partes: - -- **EMPEZAR** contiene un recorrido rápido e instrucciones de instalación para comenzar a usar 🤗 Transformers. -- **TUTORIALES** es un excelente lugar para comenzar. Esta sección te ayudará a obtener las habilidades básicas que necesitas para comenzar a usar 🤗 Transformers. -- **GUÍAS PRÁCTICAS** te mostrará cómo lograr un objetivo específico, cómo hacer fine-tuning a un modelo preentrenado para el modelado de lenguaje o cómo crear un cabezal para un modelo personalizado. -- **GUÍAS CONCEPTUALES** proporciona más discusión y explicación de los conceptos e ideas subyacentes detrás de los modelos, las tareas y la filosofía de diseño de 🤗 Transformers. - -La biblioteca actualmente contiene implementaciones de JAX, PyTorch y TensorFlow, pesos de modelos preentrenados, scripts de uso y utilidades de conversión para los siguientes modelos. - -### Modelos compatibles - - - -1. **[ALBERT](model_doc/albert)** (de Google Research y el Instituto Tecnológico de Toyota en Chicago) publicado con el paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), por Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[BART](model_doc/bart)** (de Facebook) publicado con el paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) por Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov y Luke Zettlemoyer. -1. 
**[BARThez](model_doc/barthez)** (de École polytechnique) publicado con el paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) por Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. -1. **[BARTpho](model_doc/bartpho)** (de VinAI Research) publicado con el paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) por Nguyen Luong Tran, Duong Minh Le y Dat Quoc Nguyen. -1. **[BEiT](model_doc/beit)** (de Microsoft) publicado con el paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) por Hangbo Bao, Li Dong, Furu Wei. -1. **[BERT](model_doc/bert)** (de Google) publicado con el paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) por Jacob Devlin, Ming-Wei Chang, Kenton Lee y Kristina Toutanova. -1. **[BERTweet](model_doc/bertweet)** (de VinAI Research) publicado con el paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) por Dat Quoc Nguyen, Thanh Vu y Anh Tuan Nguyen. -1. **[BERT For Sequence Generation](model_doc/bert-generation)** (de Google) publicado con el paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) por Sascha Rothe, Shashi Narayan, Aliaksei Severyn. -1. **[BigBird-RoBERTa](model_doc/big_bird)** (de Google Research) publicado con el paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) por Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BigBird-Pegasus](model_doc/bigbird_pegasus)** (de Google Research) publicado con el paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) por Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[Blenderbot](model_doc/blenderbot)** (de Facebook) publicado con el paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) por Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. **[BlenderbotSmall](model_doc/blenderbot-small)** (de Facebook) publicado con el paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) por Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. **[BORT](model_doc/bort)** (de Alexa) publicado con el paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) por Adrian de Wynter y Daniel J. Perry. -1. **[ByT5](model_doc/byt5)** (de Google Research) publicado con el paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) por Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. -1. **[CamemBERT](model_doc/camembert)** (de Inria/Facebook/Sorbonne) publicado con el paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) por Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah y Benoît Sagot. -1. 
**[CANINE](model_doc/canine)** (de Google Research) publicado con el paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) por Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. -1. **[ConvNeXT](model_doc/convnext)** (de Facebook AI) publicado con el paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) por Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. -1. **[CLIP](model_doc/clip)** (de OpenAI) publicado con el paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) por Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. -1. **[ConvBERT](model_doc/convbert)** (de YituTech) publicado con el paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) por Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. -1. **[CPM](model_doc/cpm)** (de Universidad de Tsinghua) publicado con el paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) por Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. -1. **[CTRL](model_doc/ctrl)** (de Salesforce) publicado con el paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) por Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong y Richard Socher. -1. **[Data2Vec](model_doc/data2vec)** (de Facebook) publicado con el paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) por Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. -1. **[DeBERTa](model_doc/deberta)** (de Microsoft) publicado con el paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) por Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. -1. **[DeBERTa-v2](model_doc/deberta-v2)** (de Microsoft) publicado con el paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) por Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. -1. **[Decision Transformer](model_doc/decision_transformer)** (de Berkeley/Facebook/Google) publicado con el paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) por Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. -1. **[DiT](model_doc/dit)** (de Microsoft Research) publicado con el paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) por Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. -1. **[DeiT](model_doc/deit)** (de Facebook) publicado con el paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) por Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. -1. 
**[DETR](model_doc/detr)** (de Facebook) publicado con el paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) por Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. -1. **[DialoGPT](model_doc/dialogpt)** (de Microsoft Research) publicado con el paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) por Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. -1. **[DistilBERT](model_doc/distilbert)** (de HuggingFace), publicado junto con el paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) por Victor Sanh, Lysandre Debut y Thomas Wolf. Se ha aplicado el mismo método para comprimir GPT2 en [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa en [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), BERT multilingüe en [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) y una versión alemana de DistilBERT. -1. **[DPR](model_doc/dpr)** (de Facebook) publicado con el paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) por Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, y Wen-tau Yih. -1. **[DPT](master/model_doc/dpt)** (de Intel Labs) publicado con el paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) por René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. -1. **[EncoderDecoder](model_doc/encoder-decoder)** (de Google Research) publicado con el paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) por Sascha Rothe, Shashi Narayan, Aliaksei Severyn. -1. **[ELECTRA](model_doc/electra)** (de Google Research/Universidad de Stanford) publicado con el paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) por Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. -1. **[FlauBERT](model_doc/flaubert)** (de CNRS) publicado con el paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) por Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. -1. **[FNet](model_doc/fnet)** (de Google Research) publicado con el paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) por James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. -1. **[Funnel Transformer](model_doc/funnel)** (de CMU/Google Brain) publicado con el paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) por Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -1. **[GLPN](model_doc/glpn)** (de KAIST) publicado con el paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) por Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. -1. 
**[GPT](model_doc/openai-gpt)** (de OpenAI) publicado con el paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) por Alec Radford, Karthik Narasimhan, Tim Salimans y Ilya Sutskever. -1. **[GPT-2](model_doc/gpt2)** (de OpenAI) publicado con el paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) por Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** y Ilya Sutskever**. -1. **[GPT-J](model_doc/gptj)** (de EleutherAI) publicado con el repositorio [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) por Ben Wang y Aran Komatsuzaki. -1. **[GPT Neo](model_doc/gpt_neo)** (de EleutherAI) publicado en el paper [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) por Sid Black, Stella Biderman, Leo Gao, Phil Wang y Connor Leahy. -1. **[Hubert](model_doc/hubert)** (de Facebook) publicado con el paper [HuBERT: Self-Supervised Speech Representation Learning por Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) por Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. -1. **[I-BERT](model_doc/ibert)** (de Berkeley) publicado con el paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) por Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. -1. **[ImageGPT](model_doc/imagegpt)** (de OpenAI) publicado con el paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) por Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. -1. **[LayoutLM](model_doc/layoutlm)** (de Microsoft Research Asia) publicado con el paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) por Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. -1. **[LayoutLMv2](model_doc/layoutlmv2)** (de Microsoft Research Asia) publicado con el paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) por Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. -1. **[LayoutXLM](model_doc/layoutxlm)** (de Microsoft Research Asia) publicado con el paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) por Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. -1. **[LED](model_doc/led)** (de AllenAI) publicado con el paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) por Iz Beltagy, Matthew E. Peters, Arman Cohan. -1. **[Longformer](model_doc/longformer)** (de AllenAI) publicado con el paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) por Iz Beltagy, Matthew E. Peters, Arman Cohan. -1. **[LUKE](model_doc/luke)** (de Studio Ousia) publicado con el paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) por Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. -1. **[mLUKE](model_doc/mluke)** (de Studio Ousia) publicado con el paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) por Ryokan Ri, Ikuya Yamada, y Yoshimasa Tsuruoka. -1. 
**[LXMERT](model_doc/lxmert)** (de UNC Chapel Hill) publicado con el paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) por Hao Tan y Mohit Bansal. -1. **[M2M100](model_doc/m2m_100)** (de Facebook) publicado con el paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) por Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. -1. **[MarianMT](model_doc/marian)** Modelos de traducción automática entrenados usando [OPUS](http://opus.nlpl.eu/) data por Jörg Tiedemann. El [Marian Framework](https://marian-nmt.github.io/) está siendo desarrollado por el equipo de traductores de Microsoft. -1. **[MaskFormer](model_doc/maskformer)** (de Meta y UIUC) publicado con el paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) por Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. -1. **[MBart](model_doc/mbart)** (de Facebook) publicado con el paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) por Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. -1. **[MBart-50](model_doc/mbart)** (de Facebook) publicado con el paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) por Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. -1. **[Megatron-BERT](model_doc/megatron-bert)** (de NVIDIA) publicado con el paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) por Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper y Bryan Catanzaro. -1. **[Megatron-GPT2](model_doc/megatron_gpt2)** (de NVIDIA) publicado con el paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) por Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper y Bryan Catanzaro. -1. **[MPNet](model_doc/mpnet)** (de Microsoft Research) publicado con el paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) por Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. -1. **[MT5](model_doc/mt5)** (de Google AI) publicado con el paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) por Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. -1. **[Nyströmformer](model_doc/nystromformer)** (de la Universidad de Wisconsin - Madison) publicado con el paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) por Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. -1. **[Pegasus](model_doc/pegasus)** (de Google) publicado con el paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) por Jingqing Zhang, Yao Zhao, Mohammad Saleh y Peter J. Liu. -1. 
**[Perceiver IO](model_doc/perceiver)** (de Deepmind) publicado con el paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) por Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. -1. **[PhoBERT](model_doc/phobert)** (de VinAI Research) publicado con el paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) por Dat Quoc Nguyen y Anh Tuan Nguyen. -1. **[PLBart](model_doc/plbart)** (de UCLA NLP) publicado con el paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) por Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. -1. **[PoolFormer](model_doc/poolformer)** (de Sea AI Labs) publicado con el paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) por Yu, Weihao y Luo, Mi y Zhou, Pan y Si, Chenyang y Zhou, Yichen y Wang, Xinchao y Feng, Jiashi y Yan, Shuicheng. -1. **[ProphetNet](model_doc/prophetnet)** (de Microsoft Research) publicado con el paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) por Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang y Ming Zhou. -1. **[QDQBert](model_doc/qdqbert)** (de NVIDIA) publicado con el paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) por Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev y Paulius Micikevicius. -1. **[REALM](model_doc/realm.html)** (de Google Research) publicado con el paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) por Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat y Ming-Wei Chang. -1. **[Reformer](model_doc/reformer)** (de Google Research) publicado con el paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) por Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. -1. **[RemBERT](model_doc/rembert)** (de Google Research) publicado con el paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) por Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. -1. **[RegNet](model_doc/regnet)** (de META Platforms) publicado con el paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) por Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. -1. **[ResNet](model_doc/resnet)** (de Microsoft Research) publicado con el paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) por Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. -1. **[RoBERTa](model_doc/roberta)** (de Facebook), publicado junto con el paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) por Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -1. **[RoFormer](model_doc/roformer)** (de ZhuiyiTechnology), publicado junto con el paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) por Jianlin Su y Yu Lu y Shengfeng Pan y Bo Wen y Yunfeng Liu. -1. 
**[SegFormer](model_doc/segformer)** (de NVIDIA) publicado con el paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) por Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. -1. **[SEW](model_doc/sew)** (de ASAPP) publicado con el paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) por Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. -1. **[SEW-D](model_doc/sew_d)** (de ASAPP) publicado con el paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) por Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. -1. **[SpeechToTextTransformer](model_doc/speech_to_text)** (de Facebook), publicado junto con el paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) por Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. -1. **[SpeechToTextTransformer2](model_doc/speech_to_text_2)** (de Facebook), publicado junto con el paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) por Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. -1. **[Splinter](model_doc/splinter)** (de Universidad de Tel Aviv), publicado junto con el paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) pory Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. -1. **[SqueezeBert](model_doc/squeezebert)** (de Berkeley) publicado con el paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) por Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, y Kurt W. Keutzer. -1. **[Swin Transformer](model_doc/swin)** (de Microsoft) publicado con el paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) por Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. -1. **[T5](model_doc/t5)** (de Google AI) publicado con el paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) por Colin Raffel y Noam Shazeer y Adam Roberts y Katherine Lee y Sharan Narang y Michael Matena y Yanqi Zhou y Wei Li y Peter J. Liu. -1. **[T5v1.1](model_doc/t5v1.1)** (de Google AI) publicado en el repositorio [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) por Colin Raffel y Noam Shazeer y Adam Roberts y Katherine Lee y Sharan Narang y Michael Matena y Yanqi Zhou y Wei Li y Peter J. Liu. -1. **[TAPAS](model_doc/tapas)** (de Google AI) publicado con el paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) por Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno y Julian Martin Eisenschlos. -1. **[TAPEX](model_doc/tapex)** (de Microsoft Research) publicado con el paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) por Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. -1. 
**[Transformer-XL](model_doc/transfo-xl)** (de Google/CMU) publicado con el paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) por Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. -1. **[TrOCR](model_doc/trocr)** (de Microsoft), publicado junto con el paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) por Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. -1. **[UniSpeech](model_doc/unispeech)** (de Microsoft Research) publicado con el paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) por Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. -1. **[UniSpeechSat](model_doc/unispeech-sat)** (de Microsoft Research) publicado con el paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) por Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. -1. **[VAN](model_doc/van)** (de la Universidad de Tsinghua y la Universidad de Nankai) publicado con el paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) por Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. -1. **[ViLT](model_doc/vilt)** (de NAVER AI Lab/Kakao Enterprise/Kakao Brain) publicado con el paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) por Wonjae Kim, Bokyung Son, Ildoo Kim. -1. **[Vision Transformer (ViT)](model_doc/vit)** (de Google AI) publicado con el paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) por Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. -1. **[ViTMAE](model_doc/vit_mae)** (de Meta AI) publicado con el paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) por Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. -1. **[VisualBERT](model_doc/visual_bert)** (de UCLA NLP) publicado con el paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) por Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -1. **[WavLM](model_doc/wavlm)** (de Microsoft Research) publicado con el paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) por Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. -1. **[Wav2Vec2](model_doc/wav2vec2)** (de Facebook AI) publicado con el paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) por Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. -1. 
**[Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)** (de Facebook AI) publicado con el paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) por Qiantong Xu, Alexei Baevski, Michael Auli. -1. **[XGLM](model_doc/xglm)** (de Facebook AI) publicado con el paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) por Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. -1. **[XLM](model_doc/xlm)** (de Facebook) publicado junto con el paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) por Guillaume Lample y Alexis Conneau. -1. **[XLM-ProphetNet](model_doc/xlm-prophetnet)** (de Microsoft Research) publicado con el paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) por Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang y Ming Zhou. -1. **[XLM-RoBERTa](model_doc/xlm-roberta)** (de Facebook AI), publicado junto con el paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) por Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer y Veselin Stoyanov. -1. **[XLM-RoBERTa-XL](model_doc/xlm-roberta-xl)** (de Facebook AI), publicado junto con el paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) por Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. -1. **[XLNet](model_doc/xlnet)** (de Google/CMU) publicado con el paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) por Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. -1. **[XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2)** (de Facebook AI) publicado con el paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) por Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. -1. **[XLS-R](model_doc/xls_r)** (de Facebook AI) publicado con el paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) por Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. -1. **[YOSO](model_doc/yoso)** (de la Universidad de Wisconsin-Madison) publicado con el paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) por Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. - - -### Frameworks compatibles - -La siguiente tabla representa el soporte actual en la biblioteca para cada uno de esos modelos, ya sea que tengan un tokenizador de Python (llamado "slow"). Un tokenizador "fast" respaldado por la biblioteca 🤗 Tokenizers, ya sea que tengan soporte en Jax (a través de -Flax), PyTorch y/o TensorFlow. 
- - - -| Modelo | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support | -|:---------------------------:|:--------------:|:--------------:|:---------------:|:------------------:|:------------:| -| ALBERT | ✅ | ✅ | ✅ | ✅ | ✅ | -| BART | ✅ | ✅ | ✅ | ✅ | ✅ | -| BEiT | ❌ | ❌ | ✅ | ❌ | ✅ | -| BERT | ✅ | ✅ | ✅ | ✅ | ✅ | -| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ | -| BigBird | ✅ | ✅ | ✅ | ❌ | ✅ | -| BigBirdPegasus | ❌ | ❌ | ✅ | ❌ | ❌ | -| Blenderbot | ✅ | ✅ | ✅ | ✅ | ✅ | -| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ | -| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| Canine | ✅ | ❌ | ✅ | ❌ | ❌ | -| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | -| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| ConvNext | ❌ | ❌ | ✅ | ✅ | ❌ | -| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | -| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ | -| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ | -| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | -| DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ | -| Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | -| DeiT | ❌ | ❌ | ✅ | ❌ | ❌ | -| DETR | ❌ | ❌ | ✅ | ❌ | ❌ | -| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | -| DPR | ✅ | ✅ | ✅ | ✅ | ❌ | -| DPT | ❌ | ❌ | ✅ | ❌ | ❌ | -| ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | -| Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | -| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ | -| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ | -| FNet | ✅ | ✅ | ✅ | ❌ | ❌ | -| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ | -| GLPN | ❌ | ❌ | ✅ | ❌ | ❌ | -| GPT Neo | ❌ | ❌ | ✅ | ❌ | ✅ | -| GPT-J | ❌ | ❌ | ✅ | ✅ | ✅ | -| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ | -| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | -| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ | -| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ | -| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ | -| LED | ✅ | ✅ | ✅ | ✅ | ❌ | -| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ | -| LUKE | ✅ | ❌ | ✅ | ❌ | ❌ | -| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ | -| Marian | ✅ | ❌ | ✅ | ✅ | ✅ | -| MaskFormer | ❌ | ❌ | ✅ | ❌ | ❌ | -| mBART | ✅ | ✅ | ✅ | ✅ | ✅ | -| MegatronBert | ❌ | ❌ | ✅ | ❌ | ❌ | -| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ | -| mT5 | ✅ | ✅ | ✅ | ✅ | ✅ | -| Nystromformer | ❌ | ❌ | ✅ | ❌ | ❌ | -| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ | -| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | -| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ | -| Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ | -| PLBart | ✅ | ❌ | ✅ | ❌ | ❌ | -| PoolFormer | ❌ | ❌ | ✅ | ❌ | ❌ | -| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | -| QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ | -| RAG | ✅ | ❌ | ✅ | ✅ | ❌ | -| Realm | ✅ | ✅ | ✅ | ❌ | ❌ | -| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ | -| RegNet | ❌ | ❌ | ✅ | ❌ | ❌ | -| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| ResNet | ❌ | ❌ | ✅ | ❌ | ❌ | -| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | -| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | -| RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ | -| SegFormer | ❌ | ❌ | ✅ | ❌ | ❌ | -| SEW | ❌ | ❌ | ✅ | ❌ | ❌ | -| SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ | -| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ | -| Speech2Text | ✅ | ❌ | ✅ | ✅ | ❌ | -| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ | -| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ | -| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ | -| Swin | ❌ | ❌ | ✅ | ❌ | ❌ | -| T5 | ✅ | ✅ | ✅ | ✅ | ✅ | -| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ | -| TAPEX | ✅ | ✅ | ✅ | ✅ | ✅ | -| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | -| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ | -| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ | -| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ | -| VAN | ❌ | ❌ | ✅ | ❌ | ❌ | -| ViLT | ❌ | ❌ | ✅ | ❌ | ❌ | -| Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | -| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ | -| VisualBert | ❌ | ❌ | ✅ | ❌ | ❌ | -| ViT | ❌ | ❌ | ✅ | ✅ | ✅ | -| ViTMAE | ❌ | ❌ | ✅ | ✅ | ❌ | -| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | -| WavLM | ❌ | ❌ | ✅ | ❌ | ❌ | -| XGLM | ✅ | ✅ | ✅ | ❌ | ✅ | -| XLM | ✅ | ❌ | ✅ | ✅ | ❌ | -| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | -| 
XLM-RoBERTa-XL | ❌ | ❌ | ✅ | ❌ | ❌ | -| XLMProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | -| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ | -| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ | - - diff --git a/docs/source/es/installation.md b/docs/source/es/installation.md new file mode 100644 index 000000000000..0eb2dcb03a44 --- /dev/null +++ b/docs/source/es/installation.md @@ -0,0 +1,242 @@ + + +# Instalación + +En esta guía puedes encontrar información para instalar 🤗 Transformers para cualquier biblioteca de Machine Learning con la que estés trabajando. Además, encontrarás información sobre cómo establecer el caché y cómo configurar 🤗 Transformers para correrlo de manera offline (opcional). + +🤗 Transformers ha sido probada en Python 3.6+, PyTorch 1.1.0+, TensorFlow 2.0+, y Flax. Para instalar la biblioteca de deep learning con la que desees trabajar, sigue las instrucciones correspondientes listadas a continuación: + +* [PyTorch](https://pytorch.org/get-started/locally/) +* [TensorFlow 2.0](https://www.tensorflow.org/install/pip) +* [Flax](https://flax.readthedocs.io/en/latest/) + +## Instalación con pip + +Es necesario instalar 🤗 Transformers en un [entorno virtual](https://docs.python.org/3/library/venv.html). Si necesitas más información sobre entornos virtuales de Python, consulta esta [guía](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/ +). Un entorno virtual facilita el manejo de proyectos y evita problemas de compatibilidad entre dependencias. + +Comienza por crear un entorno virtual en el directorio de tu proyecto: + +```bash +python -m venv .env +``` + +Activa el entorno virtual: + +```bash +source .env/bin/activate +``` + +Ahora puedes instalar 🤗 Transformers con el siguiente comando: + +```bash +pip install transformers +``` + +Solo para CPU, puedes instalar 🤗 Transformers y una biblioteca de deep learning con un comando de una sola línea. + +Por ejemplo, instala 🤗 Transformers y Pytorch: + +```bash +pip install transformers[torch] +``` + +🤗 Transformers y TensorFlow 2.0: + +```bash +pip install transformers[tf-cpu] +``` + +🤗 Transformers y Flax: + +```bash +pip install transformers[flax] +``` + +Por último, revisa si 🤗 Transformers ha sido instalada exitosamente con el siguiente comando que descarga un modelo pre-entrenado: + +```bash +python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))" +``` +Después imprime la etiqueta y el puntaje: + +```bash +[{'label': 'POSITIVE', 'score': 0.9998704791069031}] +``` + +## Instalación desde la fuente + +Instala 🤗 Transformers desde la fuente con el siguiente comando: + +```bash +pip install git+https://github.com/huggingface/transformers +``` + +El comando de arriba instala la versión `master` más actual en vez de la última versión estable. La versión `master` es útil para obtener los últimos avances de 🤗 Transformers. Por ejemplo, se puede dar el caso de que un error fue corregido después de la última versión estable pero aún no se ha liberado un nuevo lanzamiento. Sin embargo, existe la posibilidad de que la versión `master` no sea estable. El equipo trata de mantener la versión `master` operacional y la mayoría de los errores son resueltos en unas cuantas horas o un día. Si encuentras algún problema, por favor abre un [Issue](https://github.com/huggingface/transformers/issues) para que pueda ser corregido más rápido. 
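+
+Para confirmar qué versión quedó instalada puedes consultarla desde Python (ejemplo ilustrativo que no forma parte de la guía original); en una instalación desde la fuente el número de versión suele terminar en `dev0`:
+
+```py
+>>> import transformers
+
+>>> print(transformers.__version__)  # p. ej. '4.27.0.dev0' en una instalación desde la fuente; el valor exacto variará
+```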
+ +Verifica si 🤗 Transformers está instalada apropiadamente con el siguiente comando: + +```bash +python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I love you'))" +``` + +## Instalación editable + +Necesitarás una instalación editable si deseas: +* Usar la versión `master` del código fuente. +* Contribuir a 🤗 Transformers y necesitas probar cambios en el código. + +Clona el repositorio e instala 🤗 Transformers con los siguientes comandos: + +```bash +git clone https://github.com/huggingface/transformers.git +cd transformers +pip install -e . +``` + +Éstos comandos van a ligar el directorio desde donde clonamos el repositorio al path de las bibliotecas de Python. Python ahora buscará dentro de la carpeta que clonaste además de los paths normales de la biblioteca. Por ejemplo, si los paquetes de Python se encuentran instalados en `~/anaconda3/envs/main/lib/python3.7/site-packages/`, Python también buscará en el directorio desde donde clonamos el repositorio `~/transformers/`. + + + +Debes mantener el directorio `transformers` si deseas seguir usando la biblioteca. + + + +Puedes actualizar tu copia local a la última versión de 🤗 Transformers con el siguiente comando: + +```bash +cd ~/transformers/ +git pull +``` + +El entorno de Python que creaste para la instalación de 🤗 Transformers encontrará la versión `master` en la siguiente ejecución. + +## Instalación con conda + +Puedes instalar 🤗 Transformers desde el canal de conda `huggingface` con el siguiente comando: + +```bash +conda install -c huggingface transformers +``` + +## Configuración de Caché + +Los modelos preentrenados se descargan y almacenan en caché localmente en: `~/.cache/huggingface/transformers/`. Este es el directorio predeterminado proporcionado por la variable de entorno de shell `TRANSFORMERS_CACHE`. En Windows, el directorio predeterminado es dado por `C:\Users\username\.cache\huggingface\transformers`. Puedes cambiar las variables de entorno de shell que se muestran a continuación, en orden de prioridad, para especificar un directorio de caché diferente: + +1. Variable de entorno del shell (por defecto): `TRANSFORMERS_CACHE`. +2. Variable de entorno del shell:`HF_HOME` + `transformers/`. +3. Variable de entorno del shell: `XDG_CACHE_HOME` + `/huggingface/transformers`. + + + +🤗 Transformers usará las variables de entorno de shell `PYTORCH_TRANSFORMERS_CACHE` o `PYTORCH_PRETRAINED_BERT_CACHE` si viene de una iteración anterior de la biblioteca y ha configurado esas variables de entorno, a menos que especifiques la variable de entorno de shell `TRANSFORMERS_CACHE`. + + + + +## Modo Offline + +🤗 Transformers puede ejecutarse en un entorno con firewall o fuera de línea (offline) usando solo archivos locales. Configura la variable de entorno `TRANSFORMERS_OFFLINE=1` para habilitar este comportamiento. + + + +Puedes añadir [🤗 Datasets](https://huggingface.co/docs/datasets/) al flujo de entrenamiento offline declarando la variable de entorno `HF_DATASETS_OFFLINE=1`. + + + +Por ejemplo, normalmente ejecutarías un programa en una red normal con firewall para instancias externas con el siguiente comando: + +```bash +python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... 
+``` + +Ejecuta este mismo programa en una instancia offline con el siguiente comando: + +```bash +HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \ +python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... +``` + +El script ahora debería ejecutarse sin bloquearse ni esperar a que se agote el tiempo de espera porque sabe que solo debe buscar archivos locales. + +### Obtener modelos y tokenizers para uso offline + +Otra opción para usar 🤗 Transformers offline es descargando previamente los archivos y después apuntar al path local donde se encuentren. Hay tres maneras de hacer esto: + +* Descarga un archivo mediante la interfaz de usuario del [Model Hub](https://huggingface.co/models) haciendo click en el ícono ↓. + + ![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/download-icon.png) + + +* Utiliza el flujo de [`PreTrainedModel.from_pretrained`] y [`PreTrainedModel.save_pretrained`]: + 1. Descarga previamente los archivos con [`PreTrainedModel.from_pretrained`]: + + ```py + >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM + + >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/T0_3B") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0_3B") + ``` + + + 2. Guarda los archivos en un directorio específico con [`PreTrainedModel.save_pretrained`]: + + ```py + >>> tokenizer.save_pretrained("./your/path/bigscience_t0") + >>> model.save_pretrained("./your/path/bigscience_t0") + ``` + + 3. Cuando te encuentres offline, recarga los archivos con [`PreTrainedModel.from_pretrained`] desde el directorio especificado: + + ```py + >>> tokenizer = AutoTokenizer.from_pretrained("./your/path/bigscience_t0") + >>> model = AutoModel.from_pretrained("./your/path/bigscience_t0") + ``` + +* Descarga de manera programática los archivos con la biblioteca [huggingface_hub](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub): + + 1. Instala la biblioteca [huggingface_hub](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub) en tu entorno virtual: + + ```bash + python -m pip install huggingface_hub + ``` + + 2. Utiliza la función [`hf_hub_download`](https://huggingface.co/docs/hub/adding-a-library#download-files-from-the-hub) para descargar un archivo a un path específico. Por ejemplo, el siguiente comando descarga el archivo `config.json` del modelo [T0](https://huggingface.co/bigscience/T0_3B) al path deseado: + + ```py + >>> from huggingface_hub import hf_hub_download + + >>> hf_hub_download(repo_id="bigscience/T0_3B", filename="config.json", cache_dir="./your/path/bigscience_t0") + ``` + +Una vez que el archivo se descargue y se almacene en caché localmente, especifica tu ruta local para cargarlo y usarlo: + +```py +>>> from transformers import AutoConfig + +>>> config = AutoConfig.from_pretrained("./your/path/bigscience_t0/config.json") +``` + + + +Para más detalles sobre cómo descargar archivos almacenados en el Hub consulta la sección [How to download files from the Hub](https://huggingface.co/docs/hub/how-to-downstream). + + diff --git a/docs/source/es/installation.mdx b/docs/source/es/installation.mdx deleted file mode 100644 index 01b9d81409d4..000000000000 --- a/docs/source/es/installation.mdx +++ /dev/null @@ -1,238 +0,0 @@ - - -# Instalación - -En esta guía puedes encontrar información para instalar 🤗 Transformers para cualquier biblioteca de Machine Learning con la que estés trabajando. 
Además, encontrarás información sobre cómo establecer el caché y cómo configurar 🤗 Transformers para correrlo de manera offline (opcional). - -🤗 Transformers ha sido probada en Python 3.6+, PyTorch 1.1.0+, TensorFlow 2.0+, y Flax. Para instalar la biblioteca de deep learning con la que desees trabajar, sigue las instrucciones correspondientes listadas a continuación: - -* [PyTorch](https://pytorch.org/get-started/locally/) -* [TensorFlow 2.0](https://www.tensorflow.org/install/pip) -* [Flax](https://flax.readthedocs.io/en/latest/) - -## Instalación con pip - -Es necesario instalar 🤗 Transformers en un [entorno virtual](https://docs.python.org/3/library/venv.html). Si necesitas más información sobre entornos virtuales de Python, consulta esta [guía](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/ -). Un entorno virtual facilita el manejo de proyectos y evita problemas de compatibilidad entre dependencias. - -Comienza por crear un entorno virtual en el directorio de tu proyecto: - -```bash -python -m venv .env -``` - -Activa el entorno virtual: - -```bash -source .env/bin/activate -``` - -Ahora puedes instalar 🤗 Transformers con el siguiente comando: - -```bash -pip install transformers -``` - -Solo para CPU, puedes instalar 🤗 Transformers y una biblioteca de deep learning con un comando de una sola línea. - -Por ejemplo, instala 🤗 Transformers y Pytorch: - -```bash -pip install transformers[torch] -``` - -🤗 Transformers y TensorFlow 2.0: - -```bash -pip install transformers[tf-cpu] -``` - -🤗 Transformers y Flax: - -```bash -pip install transformers[flax] -``` - -Por último, revisa si 🤗 Transformers ha sido instalada exitosamente con el siguiente comando que descarga un modelo pre-entrenado: - -```bash -python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))" -``` -Después imprime la etiqueta y el puntaje: - -```bash -[{'label': 'POSITIVE', 'score': 0.9998704791069031}] -``` - -## Instalación desde la fuente - -Instala 🤗 Transformers desde la fuente con el siguiente comando: - -```bash -pip install git+https://github.com/huggingface/transformers -``` - -El comando de arriba instala la versión `master` más actual en vez de la última versión estable. La versión `master` es útil para obtener los últimos avances de 🤗 Transformers. Por ejemplo, se puede dar el caso de que un error fue corregido después de la última versión estable pero aún no se ha liberado un nuevo lanzamiento. Sin embargo, existe la posibilidad de que la versión `master` no sea estable. El equipo trata de mantener la versión `master` operacional y la mayoría de los errores son resueltos en unas cuantas horas o un día. Si encuentras algún problema, por favor abre un [Issue](https://github.com/huggingface/transformers/issues) para que pueda ser corregido más rápido. - -Verifica si 🤗 Transformers está instalada apropiadamente con el siguiente comando: - -```bash -python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I love you'))" -``` - -## Instalación editable - -Necesitarás una instalación editable si deseas: -* Usar la versión `master` del código fuente. -* Contribuir a 🤗 Transformers y necesitas probar cambios en el código. - -Clona el repositorio e instala 🤗 Transformers con los siguientes comandos: - -```bash -git clone https://github.com/huggingface/transformers.git -cd transformers -pip install -e . 
-``` - -Éstos comandos van a ligar el directorio desde donde clonamos el repositorio al path de las bibliotecas de Python. Python ahora buscará dentro de la carpeta que clonaste además de los paths normales de la biblioteca. Por ejemplo, si los paquetes de Python se encuentran instalados en `~/anaconda3/envs/main/lib/python3.7/site-packages/`, Python también buscará en el directorio desde donde clonamos el repositorio `~/transformers/`. - - - -Debes mantener el directorio `transformers` si deseas seguir usando la biblioteca. - - - -Puedes actualizar tu copia local a la última versión de 🤗 Transformers con el siguiente comando: - -```bash -cd ~/transformers/ -git pull -``` - -El entorno de Python que creaste para la instalación de 🤗 Transformers encontrará la versión `master` en la siguiente ejecución. - -## Instalación con conda - -Puedes instalar 🤗 Transformers desde el canal de conda `huggingface` con el siguiente comando: - -```bash -conda install -c huggingface transformers -``` - -## Configuración de Caché - -Los modelos preentrenados se descargan y almacenan en caché localmente en: `~/.cache/huggingface/transformers/`. Este es el directorio predeterminado proporcionado por la variable de entorno de shell `TRANSFORMERS_CACHE`. En Windows, el directorio predeterminado es dado por `C:\Users\username\.cache\huggingface\transformers`. Puedes cambiar las variables de entorno de shell que se muestran a continuación, en orden de prioridad, para especificar un directorio de caché diferente: - -1. Variable de entorno del shell (por defecto): `TRANSFORMERS_CACHE`. -2. Variable de entorno del shell:`HF_HOME` + `transformers/`. -3. Variable de entorno del shell: `XDG_CACHE_HOME` + `/huggingface/transformers`. - - - -🤗 Transformers usará las variables de entorno de shell `PYTORCH_TRANSFORMERS_CACHE` o `PYTORCH_PRETRAINED_BERT_CACHE` si viene de una iteración anterior de la biblioteca y ha configurado esas variables de entorno, a menos que especifiques la variable de entorno de shell `TRANSFORMERS_CACHE`. - - - - -## Modo Offline - -🤗 Transformers puede ejecutarse en un entorno con firewall o fuera de línea (offline) usando solo archivos locales. Configura la variable de entorno `TRANSFORMERS_OFFLINE=1` para habilitar este comportamiento. - - - -Puedes añadir [🤗 Datasets](https://huggingface.co/docs/datasets/) al flujo de entrenamiento offline declarando la variable de entorno `HF_DATASETS_OFFLINE=1`. - - - -Por ejemplo, normalmente ejecutarías un programa en una red normal con firewall para instancias externas con el siguiente comando: - -```bash -python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... -``` - -Ejecuta este mismo programa en una instancia offline con el siguiente comando: - -```bash -HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \ -python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... -``` - -El script ahora debería ejecutarse sin bloquearse ni esperar a que se agote el tiempo de espera porque sabe que solo debe buscar archivos locales. - -### Obtener modelos y tokenizers para uso offline - -Otra opción para usar 🤗 Transformers offline es descargando previamente los archivos y después apuntar al path local donde se encuentren. Hay tres maneras de hacer esto: - -* Descarga un archivo mediante la interfaz de usuario del [Model Hub](https://huggingface.co/models) haciendo click en el ícono ↓. 
- - ![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/download-icon.png) - - -* Utiliza el flujo de [`PreTrainedModel.from_pretrained`] y [`PreTrainedModel.save_pretrained`]: - 1. Descarga previamente los archivos con [`PreTrainedModel.from_pretrained`]: - - ```py - >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - - >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/T0_3B") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0_3B") - ``` - - - 2. Guarda los archivos en un directorio específico con [`PreTrainedModel.save_pretrained`]: - - ```py - >>> tokenizer.save_pretrained("./your/path/bigscience_t0") - >>> model.save_pretrained("./your/path/bigscience_t0") - ``` - - 3. Cuando te encuentres offline, recarga los archivos con [`PreTrainedModel.from_pretrained`] desde el directorio especificado: - - ```py - >>> tokenizer = AutoTokenizer.from_pretrained("./your/path/bigscience_t0") - >>> model = AutoModel.from_pretrained("./your/path/bigscience_t0") - ``` - -* Descarga de manera programática los archivos con la biblioteca [huggingface_hub](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub): - - 1. Instala la biblioteca [huggingface_hub](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub) en tu entorno virtual: - - ```bash - python -m pip install huggingface_hub - ``` - - 2. Utiliza la función [`hf_hub_download`](https://huggingface.co/docs/hub/adding-a-library#download-files-from-the-hub) para descargar un archivo a un path específico. Por ejemplo, el siguiente comando descarga el archivo `config.json` del modelo [T0](https://huggingface.co/bigscience/T0_3B) al path deseado: - - ```py - >>> from huggingface_hub import hf_hub_download - - >>> hf_hub_download(repo_id="bigscience/T0_3B", filename="config.json", cache_dir="./your/path/bigscience_t0") - ``` - -Una vez que el archivo se descargue y se almacene en caché localmente, especifica tu ruta local para cargarlo y usarlo: - -```py ->>> from transformers import AutoConfig - ->>> config = AutoConfig.from_pretrained("./your/path/bigscience_t0/config.json") -``` - - - -Para más detalles sobre cómo descargar archivos almacenados en el Hub consulta la sección [How to download files from the Hub](https://huggingface.co/docs/hub/how-to-downstream). - - diff --git a/docs/source/es/model_sharing.md b/docs/source/es/model_sharing.md new file mode 100644 index 000000000000..46e1ee07a9a5 --- /dev/null +++ b/docs/source/es/model_sharing.md @@ -0,0 +1,223 @@ + + +# Compartir un modelo + +Los últimos dos tutoriales mostraron cómo puedes realizar fine-tunning a un modelo con PyTorch, Keras y 🤗 Accelerate para configuraciones distribuidas. ¡El siguiente paso es compartir tu modelo con la comunidad! En Hugging Face creemos en compartir abiertamente a todos el conocimiento y los recursos para democratizar la inteligencia artificial. En este sentido, te animamos a considerar compartir tu modelo con la comunidad, de esta forma ayudas a otros ahorrando tiempo y recursos. + +En este tutorial aprenderás dos métodos para compartir un modelo trained o fine-tuned en el [Model Hub](https://huggingface.co/models): + +- Mediante Código, enviando (push) tus archivos al Hub. +- Con la interfaz Web, con Drag-and-drop de tus archivos al Hub. + + + + + +Para compartir un modelo con la comunidad necesitas una cuenta en [huggingface.co](https://huggingface.co/join). También puedes unirte a una organización existente o crear una nueva. 
+ + + +## Características de los repositorios + +Cada repositorio en el Model Hub se comporta como cualquier otro repositorio en GitHub. Nuestros repositorios ofrecen versioning, commit history, y la habilidad para visualizar diferencias. + +El versioning desarrollado dentro del Model Hub es basado en git y [git-lfs](https://git-lfs.github.com/). En otras palabras, puedes tratar un modelo como un repositorio, brindando un mejor control de acceso y escalabilidad. Version control permite *revisions*, un método para apuntar a una versión específica de un modelo utilizando un commit hash, tag o branch. + +Como resultado, puedes cargar una versión específica del modelo con el parámetro `revision`: + +```py +>>> model = AutoModel.from_pretrained( +... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash +... ) +``` + +Los archivos son editados fácilmente dentro de un repositorio. Incluso puedes observar el commit history y las diferencias: + +![vis_diff](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vis_diff.png) + +## Configuración inicial + +Antes de compartir un modelo al Hub necesitarás tus credenciales de Hugging Face. Si tienes acceso a una terminal ejecuta el siguiente comando en el entorno virtual donde 🤗 Transformers esté instalado. Esto guardará tu token de acceso dentro de tu carpeta cache de Hugging Face (~/.cache/ by default): + +```bash +huggingface-cli login +``` + +Si usas un notebook como Jupyter o Colaboratory, asegúrate de tener instalada la biblioteca [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library). Esta biblioteca te permitirá interactuar por código con el Hub. + +```bash +pip install huggingface_hub +``` + +Luego usa `notebook_login` para iniciar sesión al Hub, y sigue el link [aquí](https://huggingface.co/settings/token) para generar un token con el que iniciaremos sesión: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Convertir un modelo para todos los Frameworks + +Para asegurarnos que tu modelo pueda ser usado por alguien que esté trabajando con un framework diferente, te recomendamos convertir y subir tu modelo con checkpoints de pytorch y tensorflow. Aunque los usuarios aún son capaces de cargar su modelo desde un framework diferente, si se omite este paso será más lento debido a que 🤗 Transformers necesitará convertir el checkpoint sobre-la-marcha. + +Convertir un checkpoint para otro framework es fácil. Asegúrate tener Pytorch y TensorFlow instalado (Véase [aquí](installation) para instrucciones de instalación), y luego encuentra el modelo específico para tu tarea en el otro Framework. + +Por ejemplo, supongamos que has entrenado DistilBert para clasificación de secuencias en PyTorch y quieres convertirlo a su equivalente en TensorFlow. 
Cargas el equivalente en TensorFlow de tu modelo para tu tarea y especificas `from_pt=True` así 🤗 Transformers convertirá el Pytorch checkpoint a un TensorFlow Checkpoint: + +```py +>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True) +``` + +Luego guardas tu nuevo modelo TensorFlow con su nuevo checkpoint: + +```py +>>> tf_model.save_pretrained("path/to/awesome-name-you-picked") +``` + +De manera similar, especificas `from_tf=True` para convertir un checkpoint de TensorFlow a Pytorch: + +```py +>>> pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True) +>>> pt_model.save_pretrained("path/to/awesome-name-you-picked") +``` + +Si algún modelo está disponible en Flax, también puedes convertir un checkpoint de Pytorch a Flax: + +```py +>>> flax_model = FlaxDistilBertForSequenceClassification.from_pretrained( +... "path/to/awesome-name-you-picked", from_pt=True +... ) +``` + +## Compartir un modelo con `Trainer` + + + +Compartir un modelo al Hub es tan simple como añadir un parámetro extra o un callback. Si recuerdas del tutorial de [fine-tuning tutorial](training), la clase [`TrainingArguments`] es donde especificas los Hiperparámetros y opciones de entrenamiento adicionales. Una de estas opciones incluye la habilidad de compartir un modelo directamente al Hub. Para ello configuras `push_to_hub=True` dentro de [`TrainingArguments`]: + +```py +>>> training_args = TrainingArguments(output_dir="my-awesome-model", push_to_hub=True) +``` + +A continuación, como usualmente, pasa tus argumentos de entrenamiento a [`Trainer`]: + +```py +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=small_train_dataset, +... eval_dataset=small_eval_dataset, +... compute_metrics=compute_metrics, +... ) +``` + +Luego que realizas fine-tune a tu modelo, llamas [`~transformers.Trainer.push_to_hub`] en [`Trainer`] para enviar el modelo al Hub!🤗 Transformers incluso añadirá automáticamente los Hiperparámetros de entrenamiento, resultados de entrenamiento y versiones del Framework a tu model card! + +```py +>>> trainer.push_to_hub() +``` + +## Compartir un modelo con `PushToHubCallback` + +Los usuarios de TensorFlow pueden activar la misma funcionalidad con [`PushToHubCallback`]. En la funcion [`PushToHubCallback`], agrega: + +- Un directorio de salida para tu modelo. +- Un tokenizador. +- El `hub_model_id`, el cual es tu usuario Hub y el nombre del modelo. + +```py +>>> from transformers import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="./your_model_save_path", tokenizer=tokenizer, hub_model_id="your-username/my-awesome-model" +... ) +``` + +Agregamos el callback a [`fit`](https://keras.io/api/models/model_training_apis/), y 🤗 Transformers enviará el modelo entrenado al Hub: + +```py +>>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback) +``` + +## Usando la función `push_to_hub` + +Puedes llamar la función `push_to_hub` directamente en tu modelo para subirlo al Hub. + +Especifica el nombre del modelo en `push_to_hub`: + +```py +>>> pt_model.push_to_hub("my-awesome-model") +``` + +Esto creará un repositorio bajo tu usuario con el nombre del modelo `my-awesome-model`. 
Ahora los usuarios pueden cargar tu modelo con la función `from_pretrained`:
+
+```py
+>>> from transformers import AutoModel
+
+>>> model = AutoModel.from_pretrained("your_username/my-awesome-model")
+```
+
+Si perteneces a una organización y quieres compartir tu modelo bajo el nombre de la organización, añade el parámetro `organization`:
+
+```py
+>>> pt_model.push_to_hub("my-awesome-model", organization="my-awesome-org")
+```
+
+La función `push_to_hub` también puede ser usada para añadir archivos al repositorio del modelo. Por ejemplo, añade un tokenizador al repositorio:
+
+```py
+>>> tokenizer.push_to_hub("my-awesome-model")
+```
+
+O quizás te gustaría añadir la versión de TensorFlow de tu modelo fine-tuned en Pytorch:
+
+```py
+>>> tf_model.push_to_hub("my-awesome-model")
+```
+
+Ahora, cuando navegues a tu perfil en Hugging Face, deberías observar el repositorio de tu modelo creado recientemente. Si das click en el tab **Files**, observarás todos los archivos que has subido al repositorio.
+
+Para más detalles sobre cómo crear y subir archivos al repositorio, consulta la [documentación del Hub](https://huggingface.co/docs/hub/how-to-upstream).
+
+## Compartir con la interfaz web
+
+Los usuarios que prefieran un enfoque no-code tienen la opción de cargar su modelo a través de la interfaz gráfica del Hub. Visita la página [huggingface.co/new](https://huggingface.co/new) para crear un nuevo repositorio:
+
+![new_model_repo](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/new_model_repo.png)
+
+Desde aquí, añade información acerca del modelo:
+
+- Selecciona el **owner** (la persona propietaria) del repositorio. Puedes ser tú o cualquier organización a la que pertenezcas.
+- Escoge un nombre para tu modelo. También será el nombre del repositorio.
+- Elige si tu modelo es público o privado.
+- Especifica la licencia que usará tu modelo.
+
+Ahora puedes hacer click en el tab **Files** y luego en el botón **Add file** para subir un nuevo archivo a tu repositorio. Luego arrastra y suelta un archivo a subir y le añades un mensaje al commit.
+
+![upload_file](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/upload_file.png)
+
+## Añadiendo una tarjeta de modelo
+
+Para asegurarnos de que los usuarios entiendan las capacidades de tu modelo, sus limitaciones, posibles sesgos y consideraciones éticas, por favor añade una tarjeta (como una tarjeta de presentación) al repositorio del modelo. La tarjeta de modelo se define en el archivo `README.md`. Puedes agregarla de una de las siguientes maneras:
+
+* Elaborando y subiendo manualmente el archivo `README.md`.
+* Dando click en el botón **Edit model card** dentro del repositorio.
+
+Toma un momento para ver la [tarjeta de modelo](https://huggingface.co/distilbert-base-uncased) de DistilBert para que tengas un buen ejemplo del tipo de información que debería incluir. Consulta [la documentación](https://huggingface.co/docs/hub/models-cards) para más detalles acerca de otras opciones que puedes controlar dentro del archivo `README.md`, como la huella de carbono del modelo o ejemplos de widgets.
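+
+Como referencia adicional, este es un boceto mínimo e hipotético (no forma parte de la guía original) de cómo crear y subir una tarjeta de modelo por código con la biblioteca `huggingface_hub`; se asume que ya iniciaste sesión con `huggingface-cli login` y que el repositorio `your_username/my-awesome-model` ya existe en el Hub:
+
+```py
+>>> from huggingface_hub import upload_file
+
+>>> # Contenido mínimo de ejemplo: metadatos YAML seguidos de una breve descripción
+>>> tarjeta = """---
+... language: es
+... license: apache-2.0
+... ---
+...
+... # my-awesome-model
+...
+... Modelo de ejemplo con fines ilustrativos.
+... """
+
+>>> # Sube el README.md al repositorio del modelo (el nombre del repositorio es hipotético)
+>>> upload_file(
+...     path_or_fileobj=tarjeta.encode(),
+...     path_in_repo="README.md",
+...     repo_id="your_username/my-awesome-model",
+... )
+```
+
+Los metadatos YAML (por ejemplo `language` y `license`) son los mismos campos que puedes editar desde el botón **Edit model card** de la interfaz web.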
diff --git a/docs/source/es/model_sharing.mdx b/docs/source/es/model_sharing.mdx deleted file mode 100644 index 38a52072b41b..000000000000 --- a/docs/source/es/model_sharing.mdx +++ /dev/null @@ -1,219 +0,0 @@ - - -# Compartir un modelo - -Los últimos dos tutoriales mostraron cómo puedes realizar fine-tunning a un modelo con PyTorch, Keras y 🤗 Accelerate para configuraciones distribuidas. ¡El siguiente paso es compartir tu modelo con la comunidad! En Hugging Face creemos en compartir abiertamente a todos el conocimiento y los recursos para democratizar la inteligencia artificial. En este sentido, te animamos a considerar compartir tu modelo con la comunidad, de esta forma ayudas a otros ahorrando tiempo y recursos. - -En este tutorial aprenderás dos métodos para compartir un modelo trained o fine-tuned en el [Model Hub](https://huggingface.co/models): - -- Mediante Código, enviando (push) tus archivos al Hub. -- Con la interfaz Web, con Drag-and-drop de tus archivos al Hub. - - - - - -Para compartir un modelo con la comunidad necesitas una cuenta en [huggingface.co](https://huggingface.co/join). También puedes unirte a una organización existente o crear una nueva. - - - -## Características de los repositorios - -Cada repositorio en el Model Hub se comporta como cualquier otro repositorio en GitHub. Nuestros repositorios ofrecen versioning, commit history, y la habilidad para visualizar diferencias. - -El versioning desarrollado dentro del Model Hub es basado en git y [git-lfs](https://git-lfs.github.com/). En otras palabras, puedes tratar un modelo como un repositorio, brindando un mejor control de acceso y escalabilidad. Version control permite *revisions*, un método para apuntar a una versión específica de un modelo utilizando un commit hash, tag o branch. - -Como resultado, puedes cargar una versión específica del modelo con el parámetro `revision`: - -```py ->>> model = AutoModel.from_pretrained( -... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash -... ) -``` - -Los archivos son editados fácilmente dentro de un repositorio. Incluso puedes observar el commit history y las diferencias: - -![vis_diff](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vis_diff.png) - -## Configuración inicial - -Antes de compartir un modelo al Hub necesitarás tus credenciales de Hugging Face. Si tienes acceso a una terminal ejecuta el siguiente comando en el entorno virtual donde 🤗 Transformers esté instalado. Esto guardará tu token de acceso dentro de tu carpeta cache de Hugging Face (~/.cache/ by default): - -```bash -huggingface-cli login -``` - -Si usas un notebook como Jupyter o Colaboratory, asegúrate de tener instalada la biblioteca [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library). Esta biblioteca te permitirá interactuar por código con el Hub. - -```bash -pip install huggingface_hub -``` - -Luego usa `notebook_login` para iniciar sesión al Hub, y sigue el link [aquí](https://huggingface.co/settings/token) para generar un token con el que iniciaremos sesión: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Convertir un modelo para todos los Frameworks - -Para asegurarnos que tu modelo pueda ser usado por alguien que esté trabajando con un framework diferente, te recomendamos convertir y subir tu modelo con checkpoints de pytorch y tensorflow. 
Aunque los usuarios aún son capaces de cargar su modelo desde un framework diferente, si se omite este paso será más lento debido a que 🤗 Transformers necesitará convertir el checkpoint sobre-la-marcha. - -Convertir un checkpoint para otro framework es fácil. Asegúrate tener Pytorch y TensorFlow instalado (Véase [aquí](installation) para instrucciones de instalación), y luego encuentra el modelo específico para tu tarea en el otro Framework. - -Por ejemplo, supongamos que has entrenado DistilBert para clasificación de secuencias en PyTorch y quieres convertirlo a su equivalente en TensorFlow. Cargas el equivalente en TensorFlow de tu modelo para tu tarea y especificas `from_pt=True` así 🤗 Transformers convertirá el Pytorch checkpoint a un TensorFlow Checkpoint: - -```py ->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True) -``` - -Luego guardas tu nuevo modelo TensorFlow con su nuevo checkpoint: - -```py ->>> tf_model.save_pretrained("path/to/awesome-name-you-picked") -``` - -De manera similar, especificas `from_tf=True` para convertir un checkpoint de TensorFlow a Pytorch: - -```py ->>> pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True) ->>> pt_model.save_pretrained("path/to/awesome-name-you-picked") -``` - -Si algún modelo está disponible en Flax, también puedes convertir un checkpoint de Pytorch a Flax: - -```py ->>> flax_model = FlaxDistilBertForSequenceClassification.from_pretrained( -... "path/to/awesome-name-you-picked", from_pt=True -... ) -``` - -## Compartir un modelo con `Trainer` - - - -Compartir un modelo al Hub es tan simple como añadir un parámetro extra o un callback. Si recuerdas del tutorial de [fine-tuning tutorial](training), la clase [`TrainingArguments`] es donde especificas los Hiperparámetros y opciones de entrenamiento adicionales. Una de estas opciones incluye la habilidad de compartir un modelo directamente al Hub. Para ello configuras `push_to_hub=True` dentro de [`TrainingArguments`]: - -```py ->>> training_args = TrainingArguments(output_dir="my-awesome-model", push_to_hub=True) -``` - -A continuación, como usualmente, pasa tus argumentos de entrenamiento a [`Trainer`]: - -```py ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=small_train_dataset, -... eval_dataset=small_eval_dataset, -... compute_metrics=compute_metrics, -... ) -``` - -Luego que realizas fine-tune a tu modelo, llamas [`~transformers.Trainer.push_to_hub`] en [`Trainer`] para enviar el modelo al Hub!🤗 Transformers incluso añadirá automáticamente los Hiperparámetros de entrenamiento, resultados de entrenamiento y versiones del Framework a tu model card! - -```py ->>> trainer.push_to_hub() -``` - -## Compartir un modelo con `PushToHubCallback` - -Los usuarios de TensorFlow pueden activar la misma funcionalidad con [`PushToHubCallback`]. En la funcion [`PushToHubCallback`], agrega: - -- Un directorio de salida para tu modelo. -- Un tokenizador. -- El `hub_model_id`, el cual es tu usuario Hub y el nombre del modelo. - -```py ->>> from transformers.keras.callbacks import PushToHubCallback - ->>> push_to_hub_callback = PushToHubCallback( -... output_dir="./your_model_save_path", tokenizer=tokenizer, hub_model_id="your-username/my-awesome-model" -... 
) -``` - -Agregamos el callback a [`fit`](https://keras.io/api/models/model_training_apis/), y 🤗 Transformers enviará el modelo entrenado al Hub: - -```py ->>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback) -``` - -## Usando la función `push_to_hub` - -Puedes llamar la función `push_to_hub` directamente en tu modelo para subirlo al Hub. - -Especifica el nombre del modelo en `push_to_hub`: - -```py ->>> pt_model.push_to_hub("my-awesome-model") -``` - -Esto creará un repositorio bajo tu usuario con el nombre del modelo `my-awesome-model`. Ahora los usuarios pueden cargar tu modelo con la función `from_pretrained`: - -```py ->>> from transformers import AutoModel - ->>> model = AutoModel.from_pretrained("your_username/my-awesome-model") -``` - -Si perteneces a una organización y quieres compartir tu modelo bajo el nombre de la organización, añade el parámetro `organization`: - -```py ->>> pt_model.push_to_hub("my-awesome-model", organization="my-awesome-org") -``` - -La función `push_to_hub` también puede ser usada para añadir archivos al repositorio del modelo. Por ejemplo, añade un tokenizador al repositorio: - -```py ->>> tokenizer.push_to_hub("my-awesome-model") -``` - -O quizás te gustaría añadir la versión de TensorFlow de tu modelo fine-tuned en Pytorch: - -```py ->>> tf_model.push_to_hub("my-awesome-model") -``` - -Ahora, cuando navegues a tu perfil en Hugging Face, deberías observar el repositorio de tu modelo creado recientemente. Si das click en el tab **Files** observarás todos los archivos que has subido al repositorio. - -Para más detalles sobre cómo crear y subir archivos al repositorio, consulta la [documentación del Hub](https://huggingface.co/docs/hub/how-to-upstream). - -## Compartir con la interfaz web - -Los usuarios que prefieran un enfoque no-code tienen la opción de cargar su modelo a través de la interfaz gráfica del Hub. Visita la página [huggingface.co/new](https://huggingface.co/new) para crear un nuevo repositorio: - -![new_model_repo](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/new_model_repo.png) - -Desde aquí, añade información acerca del modelo: - -- Selecciona el **owner** (la persona propietaria) del repositorio. Puedes ser tú o cualquier organización a la que pertenezcas. -- Escoge un nombre para tu modelo. También será el nombre del repositorio. -- Elige si tu modelo es público o privado. -- Especifica la licencia que usará tu modelo. - -Ahora puedes hacer click en el tab **Files** y luego en el botón **Add file** para subir un nuevo archivo a tu repositorio. Luego arrastra y suelta un archivo a subir y le añades un mensaje al commit. - -![upload_file](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/upload_file.png) - -## Añadiendo una tarjeta de modelo - -Para asegurarnos que los usuarios entiendan las capacidades de tu modelo, sus limitaciones, posibles sesgos y consideraciones éticas, por favor añade una tarjeta (como una tarjeta de presentación) al repositorio del modelo. La tarjeta de modelo es definida en el archivo `README.md`. Puedes agregar una de la siguiente manera: - -* Elaborando y subiendo manualmente el archivo`README.md`. -* Dando click en el botón **Edit model card** dentro del repositorio. - -Toma un momento para ver la [tarjeta de modelo](https://huggingface.co/distilbert-base-uncased) de DistilBert para que tengas un buen ejemplo del tipo de información que debería incluir. 
Consulta [la documentación](https://huggingface.co/docs/hub/models-cards) para más detalles acerca de otras opciones que puedes controlar dentro del archivo `README.md` como la huella de carbono del modelo o ejemplos de widgets. Consulta la documentación [aquí] (https://huggingface.co/docs/hub/models-cards). diff --git a/docs/source/es/multilingual.md b/docs/source/es/multilingual.md new file mode 100644 index 000000000000..fa60cac68c26 --- /dev/null +++ b/docs/source/es/multilingual.md @@ -0,0 +1,179 @@ + + +# Modelos multilingües para inferencia + +[[open-in-colab]] + +Existen varios modelos multilingües en 🤗 Transformers y su uso para inferencia difiere de los modelos monolingües. Sin embargo, no *todos* los usos de los modelos multilingües son diferentes. Algunos modelos, como [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased), pueden utilizarse igual que un modelo monolingüe. Esta guía te enseñará cómo utilizar modelos multilingües cuyo uso difiere en la inferencia. + +## XLM + +XLM tiene diez checkpoints diferentes de los cuales solo uno es monolingüe. Los nueve checkpoints restantes del modelo pueden dividirse en dos categorías: los checkpoints que utilizan language embeddings y los que no. + +### XLM con language embeddings + +Los siguientes modelos XLM usan language embeddings para especificar el lenguaje utilizado en la inferencia: + +- `xlm-mlm-ende-1024` (Masked language modeling, English-German) +- `xlm-mlm-enfr-1024` (Masked language modeling, English-French) +- `xlm-mlm-enro-1024` (Masked language modeling, English-Romanian) +- `xlm-mlm-xnli15-1024` (Masked language modeling, XNLI languages) +- `xlm-mlm-tlm-xnli15-1024` (Masked language modeling + translation, XNLI languages) +- `xlm-clm-enfr-1024` (Causal language modeling, English-French) +- `xlm-clm-ende-1024` (Causal language modeling, English-German) + +Los language embeddings son representados como un tensor de la mismas dimensiones que los `input_ids` pasados al modelo. Los valores de estos tensores dependen del idioma utilizado y se identifican mediante los atributos `lang2id` y `id2lang` del tokenizador. + +En este ejemplo, carga el checkpoint `xlm-clm-enfr-1024` (Causal language modeling, English-French): + +```py +>>> import torch +>>> from transformers import XLMTokenizer, XLMWithLMHeadModel + +>>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024") +>>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024") +``` + +El atributo `lang2id` del tokenizador muestra los idiomas de este modelo y sus ids: + +```py +>>> print(tokenizer.lang2id) +{'en': 0, 'fr': 1} +``` + +A continuación, crea un input de ejemplo: + +```py +>>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1 +``` + +Establece el id del idioma, por ejemplo `"en"`, y utilízalo para definir el language embedding. El language embedding es un tensor lleno de `0` ya que es el id del idioma para inglés. Este tensor debe ser del mismo tamaño que `input_ids`. 
+ +```py +>>> language_id = tokenizer.lang2id["en"] # 0 +>>> langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, ..., 0]) + +>>> # We reshape it to be of size (batch_size, sequence_length) +>>> langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1) +``` + +Ahora puedes pasar los `input_ids` y el language embedding al modelo: + +```py +>>> outputs = model(input_ids, langs=langs) +``` + +El script [run_generation.py](https://github.com/huggingface/transformers/tree/master/examples/pytorch/text-generation/run_generation.py) puede generar texto con language embeddings utilizando los checkpoints `xlm-clm`. + +### XLM sin language embeddings + +Los siguientes modelos XLM no requieren language embeddings durante la inferencia: + +- `xlm-mlm-17-1280` (modelado de lenguaje enmascarado, 17 idiomas) +- `xlm-mlm-100-1280` (modelado de lenguaje enmascarado, 100 idiomas) + +Estos modelos se utilizan para representaciones genéricas de frases a diferencia de los anteriores checkpoints XLM. + +## BERT + +Los siguientes modelos de BERT pueden utilizarse para tareas multilingües: + +- `bert-base-multilingual-uncased` (modelado de lenguaje enmascarado + predicción de la siguiente oración, 102 idiomas) +- `bert-base-multilingual-cased` (modelado de lenguaje enmascarado + predicción de la siguiente oración, 104 idiomas) + +Estos modelos no requieren language embeddings durante la inferencia. Deben identificar la lengua a partir del +contexto e inferir en consecuencia. + +## XLM-RoBERTa + +Los siguientes modelos de XLM-RoBERTa pueden utilizarse para tareas multilingües: + +- `xlm-roberta-base` (modelado de lenguaje enmascarado, 100 idiomas) +- `xlm-roberta-large` (Modelado de lenguaje enmascarado, 100 idiomas) + +XLM-RoBERTa se entrenó con 2,5 TB de datos CommonCrawl recién creados y depurados en 100 idiomas. Proporciona fuertes ventajas sobre los modelos multilingües publicados anteriormente como mBERT o XLM en tareas posteriores como la clasificación, el etiquetado de secuencias y la respuesta a preguntas. + +## M2M100 + +Los siguientes modelos de M2M100 pueden utilizarse para traducción multilingüe: + +- `facebook/m2m100_418M` (traducción) +- `facebook/m2m100_1.2B` (traducción) + +En este ejemplo, carga el checkpoint `facebook/m2m100_418M` para traducir del chino al inglés. Puedes establecer el idioma de origen en el tokenizador: + +```py +>>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer + +>>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." +>>> chinese_text = "不要插手巫師的事務, 因為他們是微妙的, 很快就會發怒." + +>>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="zh") +>>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") +``` + +Tokeniza el texto: + +```py +>>> encoded_zh = tokenizer(chinese_text, return_tensors="pt") +``` + +M2M100 fuerza el id del idioma de destino como el primer token generado para traducir al idioma de destino.. Establece el `forced_bos_token_id` a `en` en el método `generate` para traducir al inglés: + +```py +>>> generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en")) +>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +'Do not interfere with the matters of the witches, because they are delicate and will soon be angry.' 
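+>>> # Nota (ejemplo ilustrativo, no incluido en la guía original): para traducir a otro idioma
+>>> # compatible con M2M100 bastaría con cambiar el id de idioma forzado, por ejemplo al francés:
+>>> # generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("fr"))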
+``` + +## MBart + +Los siguientes modelos de MBart pueden utilizarse para traducción multilingüe: + +- `facebook/mbart-large-50-one-to-many-mmt` (traducción automática multilingüe de uno a muchos, 50 idiomas) +- `facebook/mbart-large-50-many-to-many-mmt` (traducción automática multilingüe de muchos a muchos, 50 idiomas) +- `facebook/mbart-large-50-many-to-one-mmt` (traducción automática multilingüe muchos a uno, 50 idiomas) +- `facebook/mbart-large-50` (traducción multilingüe, 50 idiomas) +- `facebook/mbart-large-cc25` + +En este ejemplo, carga el checkpoint `facebook/mbart-large-50-many-to-many-mmt` para traducir del finlandés al inglés. Puedes establecer el idioma de origen en el tokenizador: + +```py +>>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM + +>>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." +>>> fi_text = "Älä sekaannu velhojen asioihin, sillä ne ovat hienovaraisia ja nopeasti vihaisia." + +>>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", src_lang="fi_FI") +>>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") +``` + +Tokeniza el texto: + +```py +>>> encoded_en = tokenizer(en_text, return_tensors="pt") +``` + +MBart fuerza el id del idioma de destino como el primer token generado para traducirlo. Establece el `forced_bos_token_id` a `en` en el método `generate` para traducir al inglés: + +```py +>>> generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.lang_code_to_id("en_XX")) +>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +"Don't interfere with the wizard's affairs, because they are subtle, will soon get angry." +``` + +Si estás usando el checkpoint `facebook/mbart-large-50-many-to-one-mmt` no necesitas forzar el id del idioma de destino como el primer token generado, de lo contrario el uso es el mismo. diff --git a/docs/source/es/multilingual.mdx b/docs/source/es/multilingual.mdx deleted file mode 100644 index 4849416a44db..000000000000 --- a/docs/source/es/multilingual.mdx +++ /dev/null @@ -1,175 +0,0 @@ - - -# Modelos multilingües para inferencia - -[[open-in-colab]] - -Existen varios modelos multilingües en 🤗 Transformers y su uso para inferencia difiere de los modelos monolingües. Sin embargo, no *todos* los usos de los modelos multilingües son diferentes. Algunos modelos, como [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased), pueden utilizarse igual que un modelo monolingüe. Esta guía te enseñará cómo utilizar modelos multilingües cuyo uso difiere en la inferencia. - -## XLM - -XLM tiene diez checkpoints diferentes de los cuales solo uno es monolingüe. Los nueve checkpoints restantes del modelo pueden dividirse en dos categorías: los checkpoints que utilizan language embeddings y los que no. 
- -### XLM con language embeddings - -Los siguientes modelos XLM usan language embeddings para especificar el lenguaje utilizado en la inferencia: - -- `xlm-mlm-ende-1024` (Masked language modeling, English-German) -- `xlm-mlm-enfr-1024` (Masked language modeling, English-French) -- `xlm-mlm-enro-1024` (Masked language modeling, English-Romanian) -- `xlm-mlm-xnli15-1024` (Masked language modeling, XNLI languages) -- `xlm-mlm-tlm-xnli15-1024` (Masked language modeling + translation, XNLI languages) -- `xlm-clm-enfr-1024` (Causal language modeling, English-French) -- `xlm-clm-ende-1024` (Causal language modeling, English-German) - -Los language embeddings son representados como un tensor de la mismas dimensiones que los `input_ids` pasados al modelo. Los valores de estos tensores dependen del idioma utilizado y se identifican mediante los atributos `lang2id` y `id2lang` del tokenizador. - -En este ejemplo, carga el checkpoint `xlm-clm-enfr-1024` (Causal language modeling, English-French): - -```py ->>> import torch ->>> from transformers import XLMTokenizer, XLMWithLMHeadModel - ->>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024") ->>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024") -``` - -El atributo `lang2id` del tokenizador muestra los idiomas de este modelo y sus ids: - -```py ->>> print(tokenizer.lang2id) -{'en': 0, 'fr': 1} -``` - -A continuación, crea un input de ejemplo: - -```py ->>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1 -``` - -Establece el id del idioma, por ejemplo `"en"`, y utilízalo para definir el language embedding. El language embedding es un tensor lleno de `0` ya que es el id del idioma para inglés. Este tensor debe ser del mismo tamaño que `input_ids`. - -```py ->>> language_id = tokenizer.lang2id["en"] # 0 ->>> langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, ..., 0]) - ->>> # We reshape it to be of size (batch_size, sequence_length) ->>> langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1) -``` - -Ahora puedes pasar los `input_ids` y el language embedding al modelo: - -```py ->>> outputs = model(input_ids, langs=langs) -``` - -El script [run_generation.py](https://github.com/huggingface/transformers/tree/master/examples/pytorch/text-generation/run_generation.py) puede generar texto con language embeddings utilizando los checkpoints `xlm-clm`. - -### XLM sin language embeddings - -Los siguientes modelos XLM no requieren language embeddings durante la inferencia: - -- `xlm-mlm-17-1280` (modelado de lenguaje enmascarado, 17 idiomas) -- `xlm-mlm-100-1280` (modelado de lenguaje enmascarado, 100 idiomas) - -Estos modelos se utilizan para representaciones genéricas de frases a diferencia de los anteriores checkpoints XLM. - -## BERT - -Los siguientes modelos de BERT pueden utilizarse para tareas multilingües: - -- `bert-base-multilingual-uncased` (modelado de lenguaje enmascarado + predicción de la siguiente oración, 102 idiomas) -- `bert-base-multilingual-cased` (modelado de lenguaje enmascarado + predicción de la siguiente oración, 104 idiomas) - -Estos modelos no requieren language embeddings durante la inferencia. Deben identificar la lengua a partir del -contexto e inferir en consecuencia. 
- -## XLM-RoBERTa - -Los siguientes modelos de XLM-RoBERTa pueden utilizarse para tareas multilingües: - -- `xlm-roberta-base` (modelado de lenguaje enmascarado, 100 idiomas) -- `xlm-roberta-large` (Modelado de lenguaje enmascarado, 100 idiomas) - -XLM-RoBERTa se entrenó con 2,5 TB de datos CommonCrawl recién creados y depurados en 100 idiomas. Proporciona fuertes ventajas sobre los modelos multilingües publicados anteriormente como mBERT o XLM en tareas posteriores como la clasificación, el etiquetado de secuencias y la respuesta a preguntas. - -## M2M100 - -Los siguientes modelos de M2M100 pueden utilizarse para traducción multilingüe: - -- `facebook/m2m100_418M` (traducción) -- `facebook/m2m100_1.2B` (traducción) - -En este ejemplo, carga el checkpoint `facebook/m2m100_418M` para traducir del chino al inglés. Puedes establecer el idioma de origen en el tokenizador: - -```py ->>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer - ->>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." ->>> chinese_text = "不要插手巫師的事務, 因為他們是微妙的, 很快就會發怒." - ->>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="zh") ->>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") -``` - -Tokeniza el texto: - -```py ->>> encoded_zh = tokenizer(chinese_text, return_tensors="pt") -``` - -M2M100 fuerza el id del idioma de destino como el primer token generado para traducir al idioma de destino.. Establece el `forced_bos_token_id` a `en` en el método `generate` para traducir al inglés: - -```py ->>> generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en")) ->>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) -'Do not interfere with the matters of the witches, because they are delicate and will soon be angry.' -``` - -## MBart - -Los siguientes modelos de MBart pueden utilizarse para traducción multilingüe: - -- `facebook/mbart-large-50-one-to-many-mmt` (traducción automática multilingüe de uno a muchos, 50 idiomas) -- `facebook/mbart-large-50-many-to-many-mmt` (traducción automática multilingüe de muchos a muchos, 50 idiomas) -- `facebook/mbart-large-50-many-to-one-mmt` (traducción automática multilingüe muchos a uno, 50 idiomas) -- `facebook/mbart-large-50` (traducción multilingüe, 50 idiomas) -- `facebook/mbart-large-cc25` - -En este ejemplo, carga el checkpoint `facebook/mbart-large-50-many-to-many-mmt` para traducir del finlandés al inglés. Puedes establecer el idioma de origen en el tokenizador: - -```py ->>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - ->>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." ->>> fi_text = "Älä sekaannu velhojen asioihin, sillä ne ovat hienovaraisia ja nopeasti vihaisia." - ->>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", src_lang="fi_FI") ->>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") -``` - -Tokeniza el texto: - -```py ->>> encoded_en = tokenizer(en_text, return_tensors="pt") -``` - -MBart fuerza el id del idioma de destino como el primer token generado para traducirlo. 
Establece el `forced_bos_token_id` a `en` en el método `generate` para traducir al inglés:
-
-```py
->>> generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.lang_code_to_id("en_XX"))
->>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
-"Don't interfere with the wizard's affairs, because they are subtle, will soon get angry."
-```
-
-Si estás usando el checkpoint `facebook/mbart-large-50-many-to-one-mmt` no necesitas forzar el id del idioma de destino como el primer token generado, de lo contrario el uso es el mismo.
diff --git a/docs/source/es/philosophy.md b/docs/source/es/philosophy.md
new file mode 100644
index 000000000000..4054ac0ae507
--- /dev/null
+++ b/docs/source/es/philosophy.md
@@ -0,0 +1,79 @@
+
+
+# Filosofía
+
+🤗 Transformers es una biblioteca construida para:
+
+- Los investigadores y educadores de NLP que busquen usar/estudiar/extender modelos transformers a gran escala
+- Profesionales que quieren optimizar esos modelos y/o ponerlos en producción
+- Ingenieros que solo quieren descargar un modelo preentrenado y usarlo para resolver una tarea NLP dada.
+
+La biblioteca fue diseñada con dos fuertes objetivos en mente:
+
+- Que sea tan fácil y rápida de utilizar como sea posible:
+
+  - Hemos limitado enormemente el número de abstracciones que el usuario tiene que aprender. De hecho, no hay casi abstracciones,
+    solo tres clases estándar necesarias para usar cada modelo: [configuration](main_classes/configuration),
+    [models](main_classes/model) y [tokenizer](main_classes/tokenizer).
+  - Todas estas clases pueden ser inicializadas de forma simple y unificada a partir de instancias pre-entrenadas mediante un método común
+    `from_pretrained()`, que se encarga de descargar (si es necesario), almacenar en caché y cargar la instancia de la clase correspondiente y los datos asociados
+    (los hiperparámetros de las configuraciones, el vocabulario de los tokenizadores y los pesos de los modelos) a partir de un checkpoint pre-entrenado proporcionado en
+    [Hugging Face Hub](https://huggingface.co/models) o de tu propio checkpoint guardado.
+  - Por encima de esas tres clases estándar, la biblioteca proporciona dos APIs: [`pipeline`] para usar rápidamente un modelo (junto a su configuración y tokenizer asociados)
+    sobre una tarea dada, y [`Trainer`]/`Keras.fit` para entrenar u optimizar de forma rápida un modelo dado.
+  - Como consecuencia, esta biblioteca NO es una caja de herramientas modular de bloques individuales para redes neuronales. Si quieres extender/construir sobre la biblioteca,
+    usa simplemente los módulos regulares de Python/PyTorch/TensorFlow/Keras y emplea las clases estándar de la biblioteca como punto de partida para reutilizar funcionalidades
+    tales como abrir/guardar modelo.
+
+- Que proporcione modelos modernos con un rendimiento lo más parecido posible al de los modelos originales:
+
+  - Proporcionamos al menos un ejemplo para cada arquitectura que reproduce un resultado proporcionado por los autores de dicha arquitectura.
+  - El código normalmente es parecido al código base original, lo cual significa que algún código PyTorch puede no ser tan
+    *pytorchic* como podría ser por haber sido convertido desde código TensorFlow, y viceversa.
+
+Unos cuantos objetivos adicionales:
+
+- Exponer las características internas de los modelos de la forma más coherente posible:
+
+  - Damos acceso, mediante una sola API, a todos los estados ocultos y pesos de atención.
+  - Las APIs del tokenizer y del modelo base están estandarizadas para cambiar fácilmente entre modelos.
+ +- Incorporar una selección subjetiva de herramientas de gran potencial para la optimización/investigación de estos modelos: + + - Una forma sencilla/coherente de añadir nuevos tokens al vocabulario e incrustraciones (embeddings, en inglés) para optimización. + - Formas sencillas de camuflar y reducir "transformer heads". + +- Cambiar fácilmente entre PyTorch y TensorFlow 2.0, permitiendo el entrenamiento usando un marco y la inferencia usando otro. + +## Conceptos principales + +La biblioteca está construida alrededor de tres tipos de clases para cada modelo: + +- **Model classes** como [`BertModel`], que consisten en más de 30 modelos PyTorch ([torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)) o modelos Keras ([tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model)) que funcionan con pesos pre-entrenados proporcionados en la + biblioteca. +- **Configuration classes** como [`BertConfig`], que almacena todos los parámetros necesarios para construir un modelo. + No siempre tienes que generarla tu. En particular, si estas usando un modelo pre-entrenado sin ninguna modificación, + la creación del modelo se encargará automáticamente de generar la configuración (que es parte del modelo). +- **Tokenizer classes** como [`BertTokenizer`], que almacena el vocabulario para cada modelo y proporciona métodos para + codificar/decodificar strings en una lista de índices de "token embeddings" para ser empleados en un modelo. + +Todas estas clases pueden ser generadas a partir de ejemplos pre-entrenados, y guardados localmente usando dos métodos: + +- `from_pretrained()` permite generar un modelo/configuración/tokenizer a partir de una versión pre-entrenada proporcionada ya sea por + la propia biblioteca (los modelos compatibles se pueden encontrar en [Model Hub](https://huggingface.co/models)) o + guardados localmente (o en un servidor) por el usuario. +- `save_pretrained()` permite guardar un modelo/configuración/tokenizer localmente, de forma que puede ser empleado de nuevo usando + `from_pretrained()`. diff --git a/docs/source/es/philosophy.mdx b/docs/source/es/philosophy.mdx deleted file mode 100644 index 65e9a2c67a42..000000000000 --- a/docs/source/es/philosophy.mdx +++ /dev/null @@ -1,75 +0,0 @@ - - -# Filosofía - -🤗 Transformers es una biblioteca construida para: - -- Los investigadores y educadores de NLP que busquen usar/estudiar/extender modelos transformers a gran escala -- Profesionales que quieren optimizar esos modelos y/o ponerlos en producción -- Ingenieros que solo quieren descargar un modelo preentrenado y usarlo para resolver una tarea NLP dada. - -La biblioteca fue diseñada con dos fuertes objetivos en mente: - -- Que sea tan fácil y rápida de utilizar como sea posible: - - - Hemos limitado enormemente el número de abstracciones que el usuario tiene que aprender. De hecho, no hay casi abstracciones, - solo tres clases estándar necesarias para usar cada modelo: [configuration](main_classes/configuration), - [models](main_classes/model) y [tokenizer](main_classes/tokenizer). 
- - Todas estas clases pueden ser inicializadas de forma simple y unificada a partir de ejemplos pre-entrenados mediante el uso de un método - `from_pretrained()` común de solicitud que se encargará de descargar (si es necesario), almacenar y cargar la solicitud de clase relacionada y datos asociados - (configurations' hyper-parameters, tokenizers' vocabulary, and models' weights) a partir de un control pre-entrenado proporcionado en - [Hugging Face Hub](https://huggingface.co/models) o de tu propio control guardado. - - Por encima de esas tres clases estándar, la biblioteca proporciona dos APIs: [`pipeline`] para usar rápidamente un modelo (junto a su configuracion y tokenizer asociados) - sobre una tarea dada, y [`Trainer`]/`Keras.fit` para entrenar u optimizar de forma rápida un modelo dado. - - Como consecuencia, esta biblioteca NO es una caja de herramientas modular de bloques individuales para redes neuronales. Si quieres extender/construir sobre la biblioteca, - usa simplemente los módulos regulares de Python/PyTorch/TensorFlow/Keras y emplea las clases estándar de la biblioteca como punto de partida para reutilizar funcionalidades - tales como abrir/guardar modelo. - -- Proporciona modelos modernos con rendimientos lo más parecido posible a los modelos originales: - - - Proporcionamos al menos un ejemplo para cada arquitectura que reproduce un resultado proporcionado por los autores de dicha arquitectura. - - El código normalmente es parecido al código base original, lo cual significa que algún código Pytorch puede no ser tan - *pytorchic* como podría ser por haber sido convertido a código TensorFlow, y viceversa. - -Unos cuantos objetivos adicionales: - -- Exponer las características internas de los modelos de la forma más coherente posible: - - - Damos acceso, mediante una sola API, a todos los estados ocultos y pesos de atención. - - Tokenizer y el modelo de API base están estandarizados para cambiar fácilmente entre modelos. - -- Incorporar una selección subjetiva de herramientas de gran potencial para la optimización/investigación de estos modelos: - - - Una forma sencilla/coherente de añadir nuevos tokens al vocabulario e incrustraciones (embeddings, en inglés) para optimización. - - Formas sencillas de camuflar y reducir "transformer heads". - -- Cambiar fácilmente entre PyTorch y TensorFlow 2.0, permitiendo el entrenamiento usando un marco y la inferencia usando otro. - -## Conceptos principales - -La biblioteca está construida alrededor de tres tipos de clases para cada modelo: - -- **Model classes** como [`BertModel`], que consisten en más de 30 modelos PyTorch ([torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)) o modelos Keras ([tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model)) que funcionan con pesos pre-entrenados proporcionados en la - biblioteca. -- **Configuration classes** como [`BertConfig`], que almacena todos los parámetros necesarios para construir un modelo. - No siempre tienes que generarla tu. En particular, si estas usando un modelo pre-entrenado sin ninguna modificación, - la creación del modelo se encargará automáticamente de generar la configuración (que es parte del modelo). -- **Tokenizer classes** como [`BertTokenizer`], que almacena el vocabulario para cada modelo y proporciona métodos para - codificar/decodificar strings en una lista de índices de "token embeddings" para ser empleados en un modelo. 
- -Todas estas clases pueden ser generadas a partir de ejemplos pre-entrenados, y guardados localmente usando dos métodos: - -- `from_pretrained()` permite generar un modelo/configuración/tokenizer a partir de una versión pre-entrenada proporcionada ya sea por - la propia biblioteca (los modelos compatibles se pueden encontrar en [Model Hub](https://huggingface.co/models)) o - guardados localmente (o en un servidor) por el usuario. -- `save_pretrained()` permite guardar un modelo/configuración/tokenizer localmente, de forma que puede ser empleado de nuevo usando - `from_pretrained()`. diff --git a/docs/source/es/pipeline_tutorial.md b/docs/source/es/pipeline_tutorial.md new file mode 100644 index 000000000000..0f77c3c3db83 --- /dev/null +++ b/docs/source/es/pipeline_tutorial.md @@ -0,0 +1,143 @@ + + +# Pipelines para inferencia + +Un [`pipeline`] simplifica el uso de cualquier modelo del [Model Hub](https://huggingface.co/models) para la inferencia en una variedad de tareas como la generación de texto, la segmentación de imágenes y la clasificación de audio. Incluso si no tienes experiencia con una modalidad específica o no comprendes el código que alimenta los modelos, ¡aún puedes usarlos con el [`pipeline`]! Este tutorial te enseñará a: + +* Utilizar un [`pipeline`] para inferencia. +* Utilizar un tokenizador o modelo específico. +* Utilizar un [`pipeline`] para tareas de audio y visión. + + + +Echa un vistazo a la documentación de [`pipeline`] para obtener una lista completa de tareas admitidas. + + + +## Uso del pipeline + +Si bien cada tarea tiene un [`pipeline`] asociado, es más sencillo usar la abstracción general [`pipeline`] que contiene todos los pipelines de tareas específicas. El [`pipeline`] carga automáticamente un modelo predeterminado y un tokenizador con capacidad de inferencia para tu tarea. + +1. Comienza creando un [`pipeline`] y específica una tarea de inferencia: + +```py +>>> from transformers import pipeline + +>>> generator = pipeline(task="text-generation") +``` + +2. Pasa tu texto de entrada al [`pipeline`]: + +```py +>>> generator("Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone") +[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Iron-priests at the door to the east, and thirteen for the Lord Kings at the end of the mountain'}] +``` + +Si tienes más de una entrada, pásala como una lista: + +```py +>>> generator( +... [ +... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", +... "Nine for Mortal Men, doomed to die, One for the Dark Lord on his dark throne", +... ] +... ) +``` + +Cualquier parámetro adicional para tu tarea también se puede incluir en el [`pipeline`]. La tarea `text-generation` tiene un método [`~generation.GenerationMixin.generate`] con varios parámetros para controlar la salida. Por ejemplo, si deseas generar más de una salida, defínelo en el parámetro `num_return_sequences`: + +```py +>>> generator( +... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", +... num_return_sequences=2, +... ) +``` + +### Selecciona un modelo y un tokenizador + +El [`pipeline`] acepta cualquier modelo del [Model Hub](https://huggingface.co/models). Hay etiquetas en el Model Hub que te permiten filtrar por el modelo que te gustaría utilizar para tu tarea. 
Una vez que hayas elegido un modelo apropiado, cárgalo con la clase `AutoModelFor` y [`AutoTokenizer`] correspondientes. Por ejemplo, carga la clase [`AutoModelForCausalLM`] para una tarea de modelado de lenguaje causal: + +```py +>>> from transformers import AutoTokenizer, AutoModelForCausalLM + +>>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") +>>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") +``` + +Crea un [`pipeline`] para tu tarea y específica el modelo y el tokenizador que cargaste: + +```py +>>> from transformers import pipeline + +>>> generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer) +``` + +Pasa tu texto de entrada a [`pipeline`] para generar algo de texto: + +```py +>>> generator("Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone") +[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Dragon-lords (for them to rule in a world ruled by their rulers, and all who live within the realm'}] +``` + +## Pipeline de audio + +La flexibilidad de [`pipeline`] significa que también se puede extender a tareas de audio. + +Por ejemplo, clasifiquemos la emoción de un breve fragmento del famoso discurso de John F. Kennedy ["We choose to go to the Moon"](https://en.wikipedia.org/wiki/We_choose_to_go_to_the_Moon). Encuentra un modelo de [audio classification](https://huggingface.co/models?pipeline_tag=audio-classification) para reconocimiento de emociones en el Model Hub y cárgalo en el [`pipeline`]: + +```py +>>> from transformers import pipeline + +>>> audio_classifier = pipeline( +... task="audio-classification", model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" +... ) +``` + +Pasa el archivo de audio al [`pipeline`]: + +```py +>>> audio_classifier("jfk_moon_speech.wav") +[{'label': 'calm', 'score': 0.13856211304664612}, + {'label': 'disgust', 'score': 0.13148026168346405}, + {'label': 'happy', 'score': 0.12635163962841034}, + {'label': 'angry', 'score': 0.12439591437578201}, + {'label': 'fearful', 'score': 0.12404385954141617}] +``` + +## Pipeline de visión + +Finalmente, utilizar un [`pipeline`] para tareas de visión es prácticamente igual. + +Específica tu tarea de visión y pasa tu imagen al clasificador. La imagen puede ser un enlace o una ruta local a la imagen. Por ejemplo, ¿qué especie de gato se muestra a continuación? + +![pipeline-cat-chonk](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg) + +```py +>>> from transformers import pipeline + +>>> vision_classifier = pipeline(task="image-classification") +>>> vision_classifier( +... images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... 
) +[{'label': 'lynx, catamount', 'score': 0.4403027892112732}, + {'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor', + 'score': 0.03433405980467796}, + {'label': 'snow leopard, ounce, Panthera uncia', + 'score': 0.032148055732250214}, + {'label': 'Egyptian cat', 'score': 0.02353910356760025}, + {'label': 'tiger cat', 'score': 0.023034192621707916}] +``` diff --git a/docs/source/es/pipeline_tutorial.mdx b/docs/source/es/pipeline_tutorial.mdx deleted file mode 100644 index af202758eb13..000000000000 --- a/docs/source/es/pipeline_tutorial.mdx +++ /dev/null @@ -1,139 +0,0 @@ - - -# Pipelines para inferencia - -Un [`pipeline`] simplifica el uso de cualquier modelo del [Model Hub](https://huggingface.co/models) para la inferencia en una variedad de tareas como la generación de texto, la segmentación de imágenes y la clasificación de audio. Incluso si no tienes experiencia con una modalidad específica o no comprendes el código que alimenta los modelos, ¡aún puedes usarlos con el [`pipeline`]! Este tutorial te enseñará a: - -* Utilizar un [`pipeline`] para inferencia. -* Utilizar un tokenizador o modelo específico. -* Utilizar un [`pipeline`] para tareas de audio y visión. - - - -Echa un vistazo a la documentación de [`pipeline`] para obtener una lista completa de tareas admitidas. - - - -## Uso del pipeline - -Si bien cada tarea tiene un [`pipeline`] asociado, es más sencillo usar la abstracción general [`pipeline`] que contiene todos los pipelines de tareas específicas. El [`pipeline`] carga automáticamente un modelo predeterminado y un tokenizador con capacidad de inferencia para tu tarea. - -1. Comienza creando un [`pipeline`] y específica una tarea de inferencia: - -```py ->>> from transformers import pipeline - ->>> generator = pipeline(task="text-generation") -``` - -2. Pasa tu texto de entrada al [`pipeline`]: - -```py ->>> generator("Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone") -[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Iron-priests at the door to the east, and thirteen for the Lord Kings at the end of the mountain'}] -``` - -Si tienes más de una entrada, pásala como una lista: - -```py ->>> generator( -... [ -... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", -... "Nine for Mortal Men, doomed to die, One for the Dark Lord on his dark throne", -... ] -... ) -``` - -Cualquier parámetro adicional para tu tarea también se puede incluir en el [`pipeline`]. La tarea `text-generation` tiene un método [`~generation.GenerationMixin.generate`] con varios parámetros para controlar la salida. Por ejemplo, si deseas generar más de una salida, defínelo en el parámetro `num_return_sequences`: - -```py ->>> generator( -... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", -... num_return_sequences=2, -... ) -``` - -### Selecciona un modelo y un tokenizador - -El [`pipeline`] acepta cualquier modelo del [Model Hub](https://huggingface.co/models). Hay etiquetas en el Model Hub que te permiten filtrar por el modelo que te gustaría utilizar para tu tarea. Una vez que hayas elegido un modelo apropiado, cárgalo con la clase `AutoModelFor` y [`AutoTokenizer`] correspondientes. 
Por ejemplo, carga la clase [`AutoModelForCausalLM`] para una tarea de modelado de lenguaje causal: - -```py ->>> from transformers import AutoTokenizer, AutoModelForCausalLM - ->>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") ->>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") -``` - -Crea un [`pipeline`] para tu tarea y específica el modelo y el tokenizador que cargaste: - -```py ->>> from transformers import pipeline - ->>> generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer) -``` - -Pasa tu texto de entrada a [`pipeline`] para generar algo de texto: - -```py ->>> generator("Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone") -[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Dragon-lords (for them to rule in a world ruled by their rulers, and all who live within the realm'}] -``` - -## Pipeline de audio - -La flexibilidad de [`pipeline`] significa que también se puede extender a tareas de audio. - -Por ejemplo, clasifiquemos la emoción de un breve fragmento del famoso discurso de John F. Kennedy ["We choose to go to the Moon"](https://en.wikipedia.org/wiki/We_choose_to_go_to_the_Moon). Encuentra un modelo de [audio classification](https://huggingface.co/models?pipeline_tag=audio-classification) para reconocimiento de emociones en el Model Hub y cárgalo en el [`pipeline`]: - -```py ->>> from transformers import pipeline - ->>> audio_classifier = pipeline( -... task="audio-classification", model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" -... ) -``` - -Pasa el archivo de audio al [`pipeline`]: - -```py ->>> audio_classifier("jfk_moon_speech.wav") -[{'label': 'calm', 'score': 0.13856211304664612}, - {'label': 'disgust', 'score': 0.13148026168346405}, - {'label': 'happy', 'score': 0.12635163962841034}, - {'label': 'angry', 'score': 0.12439591437578201}, - {'label': 'fearful', 'score': 0.12404385954141617}] -``` - -## Pipeline de visión - -Finalmente, utilizar un [`pipeline`] para tareas de visión es prácticamente igual. - -Específica tu tarea de visión y pasa tu imagen al clasificador. La imagen puede ser un enlace o una ruta local a la imagen. Por ejemplo, ¿qué especie de gato se muestra a continuación? - -![pipeline-cat-chonk](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg) - -```py ->>> from transformers import pipeline - ->>> vision_classifier = pipeline(task="image-classification") ->>> vision_classifier( -... images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" -... ) -[{'label': 'lynx, catamount', 'score': 0.4403027892112732}, - {'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor', - 'score': 0.03433405980467796}, - {'label': 'snow leopard, ounce, Panthera uncia', - 'score': 0.032148055732250214}, - {'label': 'Egyptian cat', 'score': 0.02353910356760025}, - {'label': 'tiger cat', 'score': 0.023034192621707916}] -``` diff --git a/docs/source/es/pr_checks.md b/docs/source/es/pr_checks.md new file mode 100644 index 000000000000..ba67e85306d3 --- /dev/null +++ b/docs/source/es/pr_checks.md @@ -0,0 +1,132 @@ + + +# Verificaciones en un Pull Request + +Cuando abres un _pull request_ en 🤗 Transformers, se ejecutarán una serie de verificaciones para asegurarte de que el _patch_ que estás agregando no rompa nada existente. 
Estas verificaciones son de cuatro tipos:
+- pruebas regulares
+- creación de la documentación
+- estilo del código y documentación
+- consistencia del repositorio
+
+En este documento, intentaremos explicar cuáles son esas diferentes verificaciones y el motivo detrás de ellas, así como también cómo depurarlas localmente si una falla en tu PR.
+
+Recuerda que todas las verificaciones requieren que tengas una instalación de desarrollo:
+
+```bash
+pip install transformers[dev]
+```
+
+o una instalación editable:
+
+```bash
+pip install -e .[dev]
+```
+
+del repositorio de Transformers.
+
+## Pruebas
+
+Todos los procesos que comienzan con `ci/circleci: run_tests_` ejecutan partes del conjunto de pruebas de Transformers. Cada uno de esos procesos se enfoca en una parte de la biblioteca en un entorno determinado: por ejemplo, `ci/circleci: run_tests_pipelines_tf` ejecuta la prueba de _pipelines_ en un entorno donde solo está instalado TensorFlow.
+
+Ten en cuenta que, para evitar ejecutar pruebas cuando no hay un cambio real en los módulos que estás probando, solo se ejecuta una parte del conjunto de pruebas: se ejecuta una tarea auxiliar para determinar las diferencias en la biblioteca antes y después del PR (lo que GitHub te muestra en la pestaña "Files changes") y selecciona las pruebas afectadas por esa diferencia. Este auxiliar se puede ejecutar localmente usando:
+
+```bash
+python utils/tests_fetcher.py
+```
+
+desde el directorio raíz del repositorio de Transformers. Se ejecutará lo siguiente:
+
+1. Verificación, para cada archivo en el _diff_, de si los cambios están en el código o solo en comentarios o _docstrings_. Solo se conservan los archivos con cambios reales de código.
+2. Creación de un mapa interno que proporciona, para cada archivo del código fuente de la biblioteca, todos los archivos a los que impacta recursivamente. Se dice que el módulo A impacta al módulo B si el módulo B importa el módulo A. Para el impacto recursivo, necesitamos una cadena de módulos que va del módulo A al módulo B en la que cada módulo importa el anterior.
+3. Aplicación de este mapa a los archivos recopilados en el paso 1, lo que nos da la lista de archivos de la biblioteca afectados por el PR.
+4. Asignación de cada uno de esos archivos a sus archivos de prueba correspondientes, para obtener la lista de pruebas a ejecutar.
+
+Al ejecutar el _script_ localmente, debes obtener impresos los resultados de los pasos 1, 3 y 4 y así saber qué pruebas se ejecutarán. El _script_ también creará un archivo llamado `test_list.txt` que contiene la lista de pruebas para ejecutar, y puedes ejecutarlas localmente con el siguiente comando:
+
+```bash
+python -m pytest -n 8 --dist=loadfile -rA -s $(cat test_list.txt)
+```
+
+En caso de que se te escape algo, el conjunto completo de pruebas también se ejecuta a diario.
+
+## Creación de la documentación
+
+El proceso `build_pr_documentation` compila y genera una vista previa de la documentación para asegurarse de que todo se vea bien una vez que se fusione tu PR. Un bot agregará un enlace para obtener una vista previa de la documentación en tu PR. Cualquier cambio que realices en el PR se actualiza automáticamente en la vista previa. Si la documentación no se genera, haz clic en **Detalles** junto al proceso fallido para ver dónde salió mal. A menudo, el error es tan simple como que falta un archivo en `toctree`.
+
+Si estás interesado en compilar u obtener una vista previa de la documentación localmente, echa un vistazo al [`README.md`](https://github.com/huggingface/transformers/tree/main/docs) en la carpeta `docs`.
+
+## Estilo de código y documentación
+
+El formato de código se aplica a todos los archivos fuente, los ejemplos y las pruebas utilizando `black` y `ruff`. También tenemos una herramienta personalizada que se ocupa del formato de los _docstrings_ y archivos `rst` (`utils/style_doc.py`), así como del orden de las importaciones _lazy_ realizadas en los archivos `__init__.py` de Transformers (`utils/custom_init_isort.py`). Todo esto se puede probar ejecutando
+
+```bash
+make style
+```
+
+El CI verifica que se hayan aplicado dentro de la verificación `ci/circleci: check_code_quality`. También se ejecuta `ruff`, que hará una verificación básica a tu código y te hará saber si encuentra una variable no definida, o una que no se usa. Para ejecutar esa verificación localmente, usa
+
+```bash
+make quality
+```
+
+Esto puede llevar mucho tiempo, así que para ejecutar lo mismo solo en los archivos que modificaste en la rama actual, ejecuta
+
+```bash
+make fixup
+```
+
+Este último comando también ejecutará todas las verificaciones adicionales para la consistencia del repositorio. Echemos un vistazo a estas pruebas.
+
+## Consistencia del repositorio
+
+Esta verificación reagrupa todas las pruebas para asegurarse de que tu PR deja el repositorio en buen estado, y se realiza mediante `ci/circleci: check_repository_consistency`. Puedes ejecutar localmente esta verificación ejecutando lo siguiente:
+
+```bash
+make repo-consistency
+```
+
+Esta instrucción verifica que:
+
+- Todos los objetos agregados al _init_ están documentados (realizado por `utils/check_repo.py`)
+- Todos los archivos `__init__.py` tienen el mismo contenido en sus dos secciones (realizado por `utils/check_inits.py`)
+- Todo el código identificado como una copia de otro módulo es consistente con el original (realizado por `utils/check_copies.py`)
+- Todas las clases de configuración tienen al menos un _checkpoint_ válido mencionado en sus _docstrings_ (realizado por `utils/check_config_docstrings.py`)
+- Las traducciones de los README y el índice del documento tienen la misma lista de modelos que el README principal (realizado por `utils/check_copies.py`)
+- Las tablas generadas automáticamente en la documentación están actualizadas (realizado por `utils/check_table.py`)
+- La biblioteca tiene todos los objetos disponibles incluso si no están instaladas todas las dependencias opcionales (realizado por `utils/check_dummies.py`)
+
+Si esta verificación falla, los primeros dos elementos requieren una reparación manual y los últimos cuatro pueden repararse automáticamente ejecutando el comando
+
+```bash
+make fix-copies
+```
+
+Las verificaciones adicionales se refieren a los PRs que agregan nuevos modelos, principalmente que:
+
+- Todos los modelos agregados están en un Auto-mapping (realizado por `utils/check_repo.py`)
+
+- Todos los modelos se verifican correctamente (realizado por `utils/check_repo.py`)
+
+
diff --git a/docs/source/es/pr_checks.mdx b/docs/source/es/pr_checks.mdx
deleted file mode 100644
index b4ae0f1c7a12..000000000000
--- a/docs/source/es/pr_checks.mdx
+++ /dev/null
@@ -1,128 +0,0 @@
-
-
-# Verificaciones en un Pull Request
-
-Cuando abres un _pull request_ en 🤗 Transformers, se ejecutarán una serie de verificaciones para asegurarte de que el _patch_ que estás agregando no rompa nada existente.
Estas verificaciones son de cuatro tipos: -- pruebas regulares -- creación de la documentación -- estilo del código y documentación -- consistencia del repositorio - -En este documento, intentaremos explicar cuáles son esas diferentes verificaciones y el motivo detrás de ellas, así como también cómo depurarlas localmente si una falla en tu PR. - -Recuerda que todas las verificaciones requieren que tengas una instalación de desarrollo: - -```bash -pip install transformers[dev] -``` - -o una instalación editable: - -```bash -pip install -e .[dev] -``` - -del repositorio de Transformers. - -## Pruebas - -Todos los procesos que comienzan con `ci/circleci: run_tests_` ejecutan partes del conjunto de pruebas de Transformers. Cada uno de esos procesos se enfoca en una parte de la biblioteca en un entorno determinado: por ejemplo, `ci/circleci: run_tests_pipelines_tf` ejecuta la prueba de _pipelines_ en un entorno donde solo está instalado TensorFlow. - -Ten en cuenta que para evitar ejecutar pruebas cuando no hay un cambio real en los módulos que estás probando, solo se ejecuta una parte del conjunto de pruebas: se ejecuta una tarea auxiliar para determinar las diferencias en la biblioteca antes y después del PR (lo que GitHub te muestra en la pestaña "Files changes") y selecciona las pruebas afectadas por esa diferencia. Este auxiliar se puede ejecutar localmente usando: - -```bash -python utils/tests_fetcher.py -``` - -desde el directorio raiz del repositorio de Transformers. Se ejecutará lo siguiente: - -1. Verificación para cada archivo en el _diff_ si los cambios están en el código, solo en comentarios o _docstrings_. Solo los archivos con cambios reales de código se conservan. -2. Creación de un mapa interno que proporciona para cada archivo del código fuente de la biblioteca todos los archivos a los que impacta recursivamente. Se dice que el módulo A impacta al módulo B si el módulo B importa el módulo A. Para el impacto recursivo, necesitamos una cadena de módulos que va del módulo A al módulo B en la que cada módulo importa el anterior. -3. Aplicación de este mapa en los archivos recopilados en el paso 1, lo que nos da una lista de archivos modelo afectados por el PR. -4. Asignación de cada uno de esos archivos a sus archivos de prueba correspondientes y para obtener una la lista de pruebas a ejecutar. - -Al ejecutar el _script_ localmente, debes obtener los resultados de los pasos 1, 3 y 4 impresos y así saber qué pruebas se ejecutarán. El _script_ también creará un archivo llamado `test_list.txt` que contiene la lista de pruebas para ejecutar, y puede ejecutarlas localmente con el siguiente comando: - -```bash -python -m pytest -n 8 --dist=loadfile -rA -s $(cat test_list.txt) -``` - -En caso de que se te escape algo, el conjunto completo de pruebas también se ejecuta a diario. - -## Creación de la documentación - -El proceso `build_pr_documentation` compila y genera una vista previa de la documentación para asegurarse de que todo se vea bien una vez que se fusione tu PR. Un bot agregará un enlace para obtener una vista previa de la documentación en tu PR. Cualquier cambio que realices en el PR se actualiza automáticamente en la vista previa. Si la documentación no se genera, haz clic en **Detalles** junto al proceso fallido para ver dónde salió mal. A menudo, el error es tan simple como que falta un archivo en `toctree`. 
- -Si estás interesado en compilar u obtener una vista previa de la documentación localmente, echa un vistazo al [`README.md`](https://github.com/huggingface/transformers/tree/main/docs) en la carpeta `docs`. - -## Estilo de código y documentación. - -El formato de código se aplica a todos los archivos fuente, los ejemplos y las pruebas utilizando `black` e `isort`. También tenemos una herramienta personalizada que se ocupa del formato de los _docstrings_ y archivos `rst` (`utils/style_doc.py`), así como del orden de las importaciones _lazy_ realizadas en los archivos `__init__.py` de Transformers (`utils /custom_init_isort.py`). Todo esto se puede probar ejecutando - -```bash -make style -``` - -CI verifica que se hayan aplicado dentro de la verificación `ci/circleci: check_code_quality`. También se ejecuta `flake8`, que hará una verificación básica a tu código y te hará saber si encuentra una variable no definida, o una que no se usa. Para ejecutar esa verificación localmente, usa - -```bash -make quality -``` - -Esto puede llevar mucho tiempo, así que para ejecutar lo mismo solo en los archivos que modificaste en la rama actual, ejecuta - -```bash -make fixup -``` - -Este último comando también ejecutará todas las verificaciones adicionales para la consistencia del repositorio. Echemos un vistazo a estas pruebas. - -## Consistencia del repositorio - -Esta verificación reagrupa todas las pruebas para asegurarse de que tu PR deja el repositorio en buen estado, y se realiza mediante `ci/circleci: check_repository_consistency`. Puedes ejecutar localmente esta verificación ejecutando lo siguiente: - -```bash -make repo-consistency -``` - -Esta instrucción verifica que: - -- Todos los objetos agregados al _init_ están documentados (realizados por `utils/check_repo.py`) -- Todos los archivos `__init__.py` tienen el mismo contenido en sus dos secciones (realizado por `utils/check_inits.py`) -- Todo el código identificado como una copia de otro módulo es consistente con el original (realizado por `utils/check_copies.py`) -- Todas las clases de configuración tienen al menos _checkpoint_ válido mencionado en sus _docstrings_ (realizado por `utils/check_config_docstrings.py`) -- Las traducciones de los README y el índice del documento tienen la misma lista de modelos que el README principal (realizado por `utils/check_copies.py`) -- Las tablas generadas automaticamente en la documentación están actualizadas (realizadas por `utils/check_table.py`) -- La biblioteca tiene todos los objetos disponibles incluso si no están instaladas todas las dependencias opcionales (realizadas por `utils/check_dummies.py`) - -Si esta verificación falla, los primeros dos elementos requieren una reparación manual, los últimos cuatro pueden repararse automáticamente ejecutando el comando - -```bash -make fix-copies -``` - -Las verificaciones adicionales se refieren a los PRs que agregan nuevos modelos, principalmente que: - -- Todos los modelos agregados están en un Auto-mapping (realizado por `utils/check_repo.py`) - -- Todos los modelos se verifican correctamente (realizados por `utils/check_repo.py`) - - diff --git a/docs/source/es/preprocessing.md b/docs/source/es/preprocessing.md new file mode 100644 index 000000000000..f4eec4862be8 --- /dev/null +++ b/docs/source/es/preprocessing.md @@ -0,0 +1,560 @@ + + +# Preprocesamiento + +[[open-in-colab]] + +Antes de que puedas utilizar los datos en un modelo, debes procesarlos en un formato aceptable para el modelo. 
Un modelo no entiende el texto en bruto, las imágenes o el audio. Estas entradas necesitan ser convertidas en números y ensambladas en tensores. En este tutorial, podrás: + +* Preprocesar los datos textuales con un tokenizador. +* Preprocesar datos de imagen o audio con un extractor de características. +* Preprocesar datos para una tarea multimodal con un procesador. + +## NLP + + + +La principal herramienta para procesar datos textuales es un [tokenizador](main_classes/tokenizer). Un tokenizador comienza dividiendo el texto en *tokens* según un conjunto de reglas. Los tokens se convierten en números, que se utilizan para construir tensores como entrada a un modelo. El tokenizador también añade cualquier entrada adicional que requiera el modelo. + + + +Si tienes previsto utilizar un modelo pre-entrenado, es importante que utilices el tokenizador pre-entrenado asociado. Esto te asegura que el texto se divide de la misma manera que el corpus de pre-entrenamiento y utiliza el mismo índice de tokens correspondiente (usualmente referido como el *vocab*) durante el pre-entrenamiento. + + + +Comienza rápidamente cargando un tokenizador pre-entrenado con la clase [`AutoTokenizer`]. Esto descarga el *vocab* utilizado cuando un modelo es pre-entrenado. + +### Tokenizar + +Carga un tokenizador pre-entrenado con [`AutoTokenizer.from_pretrained`]: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") +``` + +A continuación, pasa tu frase al tokenizador: + +```py +>>> encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.") +>>> print(encoded_input) +{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +El tokenizador devuelve un diccionario con tres ítems importantes: + +* [input_ids](glossary#input-ids) son los índices correspondientes a cada token de la frase. +* [attention_mask](glossary#attention-mask) indica si un token debe ser atendido o no. +* [token_type_ids](glossary#token-type-ids) identifica a qué secuencia pertenece un token cuando hay más de una secuencia. + +Tu puedes decodificar el `input_ids` para devolver la entrada original: + +```py +>>> tokenizer.decode(encoded_input["input_ids"]) +'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]' +``` + +Como puedes ver, el tokenizador ha añadido dos tokens especiales - `CLS` y `SEP` (clasificador y separador) - a la frase. No todos los modelos necesitan +tokens especiales, pero si lo llegas a necesitar, el tokenizador los añadirá automáticamente. + +Si hay varias frases que quieres preprocesar, pasa las frases como una lista al tokenizador: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... 
]
+>>> encoded_inputs = tokenizer(batch_sentences)
+>>> print(encoded_inputs)
+{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102],
+ [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102],
+ [101, 1327, 1164, 5450, 23434, 136, 102]],
+ 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0, 0]],
+ 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1],
+ [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+ [1, 1, 1, 1, 1, 1, 1]]}
+```
+
+### Pad
+
+Esto nos lleva a un tema importante. Cuando se procesa un batch de frases, no siempre tienen la misma longitud. Esto es un problema porque los tensores que se introducen en el modelo deben tener una forma uniforme. El pad es una estrategia para asegurar que los tensores sean rectangulares: se añade un "padding token" especial a las oraciones con menos tokens.
+
+Establece el parámetro `padding` en `True` para aplicar el pad a las secuencias más cortas del batch, de modo que coincidan con la secuencia más larga:
+
+```py
+>>> batch_sentences = [
+... "But what about second breakfast?",
+... "Don't think he knows about second breakfast, Pip.",
+... "What about elevensies?",
+... ]
+>>> encoded_input = tokenizer(batch_sentences, padding=True)
+>>> print(encoded_input)
+{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0],
+ [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102],
+ [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]],
+ 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
+ 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
+ [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+ [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]}
+```
+
+Observa que el tokenizador ha aplicado el pad a la primera y la tercera frase con un "0" porque son más cortas.
+
+### Truncamiento
+
+En el otro extremo del espectro, a veces una secuencia puede ser demasiado larga para un modelo. En este caso, tendrás que truncar la secuencia a una longitud más corta.
+
+Establece el parámetro `truncation` a `True` para truncar una secuencia a la longitud máxima aceptada por el modelo:
+
+```py
+>>> batch_sentences = [
+... "But what about second breakfast?",
+... "Don't think he knows about second breakfast, Pip.",
+... "What about elevensies?",
+... ]
+>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True)
+>>> print(encoded_input)
+{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0],
+ [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102],
+ [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]],
+ 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
+ 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
+ [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+ [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]}
+```
+
+### Construye tensores
+
+Finalmente, querrás que el tokenizador devuelva los tensores reales que se introducen en el modelo.
+
+Establece el parámetro `return_tensors` como `pt` para PyTorch, o `tf` para TensorFlow:
+
+```py
+>>> batch_sentences = [
+... "But what about second breakfast?",
+... "Don't think he knows about second breakfast, Pip.",
+... "What about elevensies?",
+... ]
+>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")
+>>> print(encoded_input)
+{'input_ids': tensor([[ 101, 153, 7719, 21490, 1122, 1114, 9582, 1623, 102],
+ [ 101, 5226, 1122, 9649, 1199, 2610, 1236, 102, 0]]),
+ 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0, 0, 0, 0]]),
+ 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
+ [1, 1, 1, 1, 1, 1, 1, 1, 0]])}
+===PT-TF-SPLIT===
+>>> batch_sentences = [
+... "But what about second breakfast?",
+... "Don't think he knows about second breakfast, Pip.",
+... "What about elevensies?",
+... ]
+>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf")
+>>> print(encoded_input)
+{'input_ids': ,
+ 'token_type_ids': ,
+ 'attention_mask': }
+```
+
+## Audio
+
+Las entradas de audio se preprocesan de forma diferente a las entradas textuales, pero el objetivo final es el mismo: crear secuencias numéricas que el modelo pueda entender. Un [extractor de características](main_classes/feature_extractor) (o feature extractor en inglés) está diseñado para extraer características de datos provenientes de imágenes o audio sin procesar y convertirlos en tensores. Antes de empezar, instala 🤗 Datasets para cargar un dataset de audio con el que experimentar:
+
+```bash
+pip install datasets
+```
+
+Carga la tarea de detección de palabras clave del benchmark [SUPERB](https://huggingface.co/datasets/superb) (consulta el [tutorial de 🤗 Datasets](https://huggingface.co/docs/datasets/load_hub.html) para obtener más detalles sobre cómo cargar un dataset):
+
+```py
+>>> from datasets import load_dataset, Audio
+
+>>> dataset = load_dataset("superb", "ks")
+```
+
+Accede al primer elemento de la columna `audio` para echar un vistazo a la entrada. Al llamar a la columna `audio` se cargará y volverá a muestrear automáticamente el archivo de audio:
+
+```py
+>>> dataset["train"][0]["audio"]
+{'array': array([ 0. , 0. , 0. , ..., -0.00592041,
+ -0.00405884, -0.00253296], dtype=float32),
+ 'path': '/root/.cache/huggingface/datasets/downloads/extracted/05734a36d88019a09725c20cc024e1c4e7982e37d7d55c0c1ca1742ea1cdd47f/_background_noise_/doing_the_dishes.wav',
+ 'sampling_rate': 16000}
+```
+
+Esto devuelve tres elementos:
+
+* `array` es la señal de voz cargada - y potencialmente remuestreada - como un array 1D.
+* `path` apunta a la ubicación del archivo de audio.
+* `sampling_rate` se refiere a cuántos puntos de datos de la señal de voz se miden por segundo.
+
+### Resample
+
+Para este tutorial, se utilizará el modelo [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base). Como puedes ver en la model card, el modelo Wav2Vec2 está pre-entrenado en audio de voz muestreado a 16kHz. Es importante que la tasa de muestreo de tus datos de audio coincida con la tasa de muestreo del dataset utilizado para pre-entrenar el modelo. Si la tasa de muestreo de tus datos no es la misma, deberás volver a muestrear tus datos de audio.
+
+Por ejemplo, carga el dataset [LJ Speech](https://huggingface.co/datasets/lj_speech), que tiene una tasa de muestreo de 22050 Hz.
+Para utilizar el modelo Wav2Vec2 con este dataset, reduce la tasa de muestreo a 16kHz:
+
+```py
+>>> lj_speech = load_dataset("lj_speech", split="train")
+>>> lj_speech[0]["audio"]
+{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ...,
+ 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32),
+ 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav',
+ 'sampling_rate': 22050}
+```
+
+1. Usa el método [`cast_column`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.cast_column) de 🤗 Datasets para reducir la tasa de muestreo a 16kHz:
+
+```py
+>>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000))
+```
+
+2. Carga el archivo de audio:
+
+```py
+>>> lj_speech[0]["audio"]
+{'array': array([-0.00064146, -0.00074657, -0.00068768, ..., 0.00068341,
+ 0.00014045, 0. ], dtype=float32),
+ 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav',
+ 'sampling_rate': 16000}
+```
+
+Como puedes ver, el `sampling_rate` se ha reducido a 16kHz. Ahora que sabes cómo funciona el resampling, volvamos a nuestro ejemplo anterior con el dataset SUPERB.
+
+### Extractor de características
+
+El siguiente paso es cargar un extractor de características para normalizar y aplicar el pad a la entrada. Cuando se aplica padding a los datos textuales, se añade un "0" para las secuencias más cortas. La misma idea se aplica a los datos de audio: el extractor de características de audio añadirá un "0" - interpretado como silencio - al "array".
+
+Carga el extractor de características con [`AutoFeatureExtractor.from_pretrained`]:
+
+```py
+>>> from transformers import AutoFeatureExtractor
+
+>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
+```
+
+Pasa el `array` de audio al extractor de características. También te recomendamos añadir el argumento `sampling_rate` en el extractor de características para poder depurar mejor los errores silenciosos que puedan producirse.
+
+```py
+>>> audio_input = [dataset["train"][0]["audio"]["array"]]
+>>> feature_extractor(audio_input, sampling_rate=16000)
+{'input_values': [array([ 0.00045439, 0.00045439, 0.00045439, ..., -0.1578519 , -0.10807519, -0.06727459], dtype=float32)]}
+```
+
+### Pad y truncamiento
+
+Al igual que con el tokenizador, puedes aplicar padding o truncamiento para manejar secuencias variables en un batch. Fíjate en la longitud de la secuencia de estas dos muestras de audio:
+
+```py
+>>> dataset["train"][0]["audio"]["array"].shape
+(1522930,)
+
+>>> dataset["train"][1]["audio"]["array"].shape
+(988891,)
+```
+
+Como puedes ver, la primera muestra tiene una secuencia más larga que la segunda. Crea una función que preprocese el dataset de modo que las muestras de audio tengan la misma longitud: especifica una longitud máxima de muestra y el extractor de características aplicará pad o truncará las secuencias para que coincidan con ella.
+
+```py
+>>> def preprocess_function(examples):
+... audio_arrays = [x["array"] for x in examples["audio"]]
+... inputs = feature_extractor(
+... audio_arrays,
+... sampling_rate=16000,
+... padding=True,
+... max_length=1000000,
+... truncation=True,
+... )
+...
return inputs +``` + +Aplica la función a los primeros ejemplos del dataset: + +```py +>>> processed_dataset = preprocess_function(dataset["train"][:5]) +``` + +Ahora echa un vistazo a las longitudes de las muestras procesadas: + +```py +>>> processed_dataset["input_values"][0].shape +(1000000,) + +>>> processed_dataset["input_values"][1].shape +(1000000,) +``` + +Las longitudes de las dos primeras muestras coinciden ahora con la longitud máxima especificada. + +## Visión + +También se utiliza un extractor de características para procesar imágenes para tareas de visión por computadora. Una vez más, el objetivo es convertir la imagen en bruto en un batch de tensores como entrada. + +Vamos a cargar el dataset [food101](https://huggingface.co/datasets/food101) para este tutorial. Usa el parámetro 🤗 Datasets `split` para cargar solo una pequeña muestra de la división de entrenamiento ya que el dataset es bastante grande: + +```py +>>> from datasets import load_dataset + +>>> dataset = load_dataset("food101", split="train[:100]") +``` + +A continuación, observa la imagen con la función 🤗 Datasets [`Image`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image): + +```py +>>> dataset[0]["image"] +``` + +![vision-preprocess-tutorial.png](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vision-preprocess-tutorial.png) + +### Extractor de características + +Carga el extractor de características con [`AutoFeatureExtractor.from_pretrained`]: + +```py +>>> from transformers import AutoFeatureExtractor + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224") +``` + +### Aumento de Datos + +Para las tareas de visión por computadora es común añadir algún tipo de aumento de datos (o data augmentation) a las imágenes como parte del preprocesamiento. Puedes añadir el método de aumento de datos con cualquier librería que quieras, pero en este tutorial utilizarás el módulo [`transforms`](https://pytorch.org/vision/stable/transforms.html) de torchvision. + +1. Normaliza la imagen y utiliza [`Compose`](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html) para encadenar algunas transformaciones - [`RandomResizedCrop`](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html) y [`ColorJitter`](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html) - juntas: + +```py +>>> from torchvision.transforms import Compose, Normalize, RandomResizedCrop, ColorJitter, ToTensor + +>>> normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std) +>>> _transforms = Compose( +... [RandomResizedCrop(feature_extractor.size), ColorJitter(brightness=0.5, hue=0.5), ToTensor(), normalize] +... ) +``` + +2. El modelo acepta [`pixel_values`](model_doc/visionencoderdecoder#transformers.VisionEncoderDecoderModel.forward.pixel_values) como entrada. Este valor es generado por el extractor de características. Crea una función que genere `pixel_values` a partir de las transformaciones: + +```py +>>> def transforms(examples): +... examples["pixel_values"] = [_transforms(image.convert("RGB")) for image in examples["image"]] +... return examples +``` + +3. A continuación, utiliza 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform) para aplicar las transformaciones sobre la marcha: + +```py +>>> dataset.set_transform(transforms) +``` + +4. 
Ahora, cuando accedes a la imagen, observarás que el extractor de características ha añadido a la entrada del modelo `pixel_values`: + +```py +>>> dataset[0]["image"] +{'image': , + 'label': 6, + 'pixel_values': tensor([[[ 0.0353, 0.0745, 0.1216, ..., -0.9922, -0.9922, -0.9922], + [-0.0196, 0.0667, 0.1294, ..., -0.9765, -0.9843, -0.9922], + [ 0.0196, 0.0824, 0.1137, ..., -0.9765, -0.9686, -0.8667], + ..., + [ 0.0275, 0.0745, 0.0510, ..., -0.1137, -0.1216, -0.0824], + [ 0.0667, 0.0824, 0.0667, ..., -0.0588, -0.0745, -0.0980], + [ 0.0353, 0.0353, 0.0431, ..., -0.0039, -0.0039, -0.0588]], + + [[ 0.2078, 0.2471, 0.2863, ..., -0.9451, -0.9373, -0.9451], + [ 0.1608, 0.2471, 0.3098, ..., -0.9373, -0.9451, -0.9373], + [ 0.2078, 0.2706, 0.3020, ..., -0.9608, -0.9373, -0.8275], + ..., + [-0.0353, 0.0118, -0.0039, ..., -0.2392, -0.2471, -0.2078], + [ 0.0196, 0.0353, 0.0196, ..., -0.1843, -0.2000, -0.2235], + [-0.0118, -0.0039, -0.0039, ..., -0.0980, -0.0980, -0.1529]], + + [[ 0.3961, 0.4431, 0.4980, ..., -0.9216, -0.9137, -0.9216], + [ 0.3569, 0.4510, 0.5216, ..., -0.9059, -0.9137, -0.9137], + [ 0.4118, 0.4745, 0.5216, ..., -0.9137, -0.8902, -0.7804], + ..., + [-0.2314, -0.1922, -0.2078, ..., -0.4196, -0.4275, -0.3882], + [-0.1843, -0.1686, -0.2000, ..., -0.3647, -0.3804, -0.4039], + [-0.1922, -0.1922, -0.1922, ..., -0.2941, -0.2863, -0.3412]]])} +``` + +Este es el aspecto de la imagen después de preprocesarla. Como era de esperar por las transformaciones aplicadas, la imagen ha sido recortada aleatoriamente y sus propiedades de color son diferentes. + +```py +>>> import numpy as np +>>> import matplotlib.pyplot as plt + +>>> img = dataset[0]["pixel_values"] +>>> plt.imshow(img.permute(1, 2, 0)) +``` + +![preprocessed_image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/preprocessed_image.png) + +## Multimodal + +Para las tareas multimodales utilizarás una combinación de todo lo que has aprendido hasta ahora y aplicarás tus habilidades a una tarea de reconocimiento automático de voz (ASR). Esto significa que necesitarás un: + +* Extractor de características para preprocesar los datos de audio. +* Un tokenizador para procesar el texto. 
+ +Volvamos al dataset [LJ Speech](https://huggingface.co/datasets/lj_speech): + +```py +>>> from datasets import load_dataset + +>>> lj_speech = load_dataset("lj_speech", split="train") +``` + +Suponiendo que te interesan principalmente las columnas `audio` y `texto`, elimina las demás columnas: + +```py +>>> lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"]) +``` + +Ahora echa un vistazo a las columnas `audio` y `texto`: + +```py +>>> lj_speech[0]["audio"] +{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ..., + 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', + 'sampling_rate': 22050} + +>>> lj_speech[0]["text"] +'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition' +``` + +Recuerda la sección anterior sobre el procesamiento de datos de audio, siempre debes [volver a muestrear](preprocessing#audio) la tasa de muestreo de tus datos de audio para que coincida con la tasa de muestreo del dataset utilizado para preentrenar un modelo: + +```py +>>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000)) +``` + +### Processor + +Un processor combina un extractor de características y un tokenizador. Cargue un procesador con [`AutoProcessor.from_pretrained]: + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") +``` + +1. Crea una función para procesar los datos de audio en `input_values`, y tokeniza el texto en `labels`. Estas son las entradas del modelo: + +```py +>>> def prepare_dataset(example): +... audio = example["audio"] + +... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000)) + +... return example +``` + +2. Aplica la función `prepare_dataset` a una muestra: + +```py +>>> prepare_dataset(lj_speech[0]) +``` + +Observa que el método processor ha añadido `input_values` y `labels`. La tasa de muestreo también se ha reducido correctamente a 16kHz. + +Genial, ahora deberías ser capaz de preprocesar datos para cualquier modalidad e incluso combinar diferentes modalidades. En el siguiente tutorial, aprenderás aplicar fine tuning a un modelo en tus datos recién preprocesados. + +## Todo lo que siempre quisiste saber sobre el padding y el truncamiento + +Hemos visto los comandos que funcionarán para la mayoría de los casos (hacer pad a tu batch teniendo en cuenta la longitud de la frase máxima y +truncar a la longitud máxima que el modelo puede aceptar). Sin embargo, la API admite más estrategias si las necesitas. Los +tres argumentos que necesitas conocer para ello son `padding`, `truncation` y `max_length`. + +- `padding` controla el aplicarme padding al texto. Puede ser un booleano o una cadena que debe ser: + + - `True` o `'longest'` para aplicar el pad hasta la secuencia más larga del batch (no apliques el padding si sólo le proporcionas + una sola secuencia). + - `'max_length'` para aplicar el pad hasta la longitud especificada por el argumento `max_length` o la longitud máxima aceptada + por el modelo si no le proporcionas `longitud_máxima` (`longitud_máxima=None`). Si sólo le proporcionas una única secuencia + se le aplicará el padding. + `False` o `'do_not_pad'` para no aplicar pad a las secuencias. 
+
+- `truncation` controla el truncamiento. Puede ser un booleano o una cadena que debe ser:
+
+  - `True` o `'longest_first'` truncan hasta la longitud máxima especificada por el argumento `max_length` o
+    la longitud máxima aceptada por el modelo si no le proporcionas `max_length` (`max_length=None`). Esto
+    truncará token por token, eliminando un token de la secuencia más larga del par hasta alcanzar la longitud
+    adecuada.
+  - `'only_second'` trunca hasta la longitud máxima especificada por el argumento `max_length` o la
+    longitud máxima aceptada por el modelo si no le proporcionas `max_length` (`max_length=None`). Esto sólo truncará
+    la segunda frase de un par si le proporcionas un par de secuencias (o un batch de pares de secuencias).
+  - `'only_first'` trunca hasta la longitud máxima especificada por el argumento `max_length` o la longitud máxima
+    aceptada por el modelo si no se proporciona `max_length` (`max_length=None`). Esto sólo truncará
+    la primera frase de un par si se proporciona un par de secuencias (o un batch de pares de secuencias).
+  - `False` o `'do_not_truncate'` para no truncar las secuencias. Como hemos visto antes, este es el comportamiento
+    por defecto.
+
+- `max_length` controla la longitud del padding/truncamiento. Puede ser un número entero o `None`, en cuyo caso
+será por defecto la longitud máxima que el modelo puede aceptar. Si el modelo no tiene una longitud máxima de entrada específica, el
+padding/truncamiento a `max_length` se desactiva.
+
+A continuación, te mostramos un breve ejemplo y una tabla que resume la forma recomendada de configurar el padding y el truncamiento. Si utilizas un par de secuencias de entrada en
+algunos de los siguientes ejemplos, puedes sustituir `truncation=True` por una `STRATEGY` seleccionada entre
+`['only_first', 'only_second', 'longest_first']`, es decir, `truncation='only_second'` o `truncation='longest_first'`, para controlar cómo se truncan ambas secuencias del par, como se ha detallado anteriormente.
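+
+Por ejemplo, este pequeño fragmento (un boceto meramente ilustrativo que asume el tokenizador de `bert-base-cased` usado antes en esta guía; las frases son arbitrarias) muestra cómo `truncation='only_second'` recorta únicamente la segunda secuencia de un par:
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+>>> pregunta = "What about second breakfast?"
+>>> contexto = "Don't think he knows about second breakfast, Pip. What about elevensies?"
+
+>>> # con truncation="only_second" sólo se recorta el contexto (la segunda secuencia) hasta max_length
+>>> recortado = tokenizer(pregunta, contexto, truncation="only_second", max_length=16)
+>>> # el resultado no debería superar los 16 tokens y la pregunta (primera secuencia) queda intacta
+>>> sin_recortar = tokenizer(pregunta, contexto)  # comparación sin truncamiento
+```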
+ +| Truncation | Padding | Instrucciones | +|--------------------------------------|-----------------------------------|---------------------------------------------------------------------------------------------| +| no truncation | no padding | `tokenizer(batch_sentences)` | +| | padding secuencia max del batch | `tokenizer(batch_sentences, padding=True)` or | +| | | `tokenizer(batch_sentences, padding='longest')` | +| | padding long max de input model | `tokenizer(batch_sentences, padding='max_length')` | +| | padding a una long especifica | `tokenizer(batch_sentences, padding='max_length', max_length=42)` | +| truncation long max del input model | no padding | `tokenizer(batch_sentences, truncation=True)` or | +| | | `tokenizer(batch_sentences, truncation=STRATEGY)` | +| | padding secuencia max del batch | `tokenizer(batch_sentences, padding=True, truncation=True)` or | +| | | `tokenizer(batch_sentences, padding=True, truncation=STRATEGY)` | +| | padding long max de input model | `tokenizer(batch_sentences, padding='max_length', truncation=True)` or | +| | | `tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY)` | +| | padding a una long especifica | Not possible | +| truncation a una long especifica | no padding | `tokenizer(batch_sentences, truncation=True, max_length=42)` or | +| | | `tokenizer(batch_sentences, truncation=STRATEGY, max_length=42)` | +| | padding secuencia max del batch | `tokenizer(batch_sentences, padding=True, truncation=True, max_length=42)` or | +| | | `tokenizer(batch_sentences, padding=True, truncation=STRATEGY, max_length=42)` | +| | padding long max de input model | Not possible | +| | padding a una long especifica | `tokenizer(batch_sentences, padding='max_length', truncation=True, max_length=42)` or | +| | | `tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY, max_length=42)` | + + + + + + + + diff --git a/docs/source/es/preprocessing.mdx b/docs/source/es/preprocessing.mdx deleted file mode 100644 index 869f90c41773..000000000000 --- a/docs/source/es/preprocessing.mdx +++ /dev/null @@ -1,556 +0,0 @@ - - -# Preprocesamiento - -[[open-in-colab]] - -Antes de que puedas utilizar los datos en un modelo, debes procesarlos en un formato aceptable para el modelo. Un modelo no entiende el texto en bruto, las imágenes o el audio. Estas entradas necesitan ser convertidas en números y ensambladas en tensores. En este tutorial, podrás: - -* Preprocesar los datos textuales con un tokenizador. -* Preprocesar datos de imagen o audio con un extractor de características. -* Preprocesar datos para una tarea multimodal con un procesador. - -## NLP - - - -La principal herramienta para procesar datos textuales es un [tokenizador](main_classes/tokenizer). Un tokenizador comienza dividiendo el texto en *tokens* según un conjunto de reglas. Los tokens se convierten en números, que se utilizan para construir tensores como entrada a un modelo. El tokenizador también añade cualquier entrada adicional que requiera el modelo. - - - -Si tienes previsto utilizar un modelo pre-entrenado, es importante que utilices el tokenizador pre-entrenado asociado. Esto te asegura que el texto se divide de la misma manera que el corpus de pre-entrenamiento y utiliza el mismo índice de tokens correspondiente (usualmente referido como el *vocab*) durante el pre-entrenamiento. - - - -Comienza rápidamente cargando un tokenizador pre-entrenado con la clase [`AutoTokenizer`]. Esto descarga el *vocab* utilizado cuando un modelo es pre-entrenado. 
- -### Tokenizar - -Carga un tokenizador pre-entrenado con [`AutoTokenizer.from_pretrained`]: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") -``` - -A continuación, pasa tu frase al tokenizador: - -```py ->>> encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.") ->>> print(encoded_input) -{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -El tokenizador devuelve un diccionario con tres ítems importantes: - -* [input_ids](glossary#input-ids) son los índices correspondientes a cada token de la frase. -* [attention_mask](glossary#attention-mask) indica si un token debe ser atendido o no. -* [token_type_ids](glossary#token-type-ids) identifica a qué secuencia pertenece un token cuando hay más de una secuencia. - -Tu puedes decodificar el `input_ids` para devolver la entrada original: - -```py ->>> tokenizer.decode(encoded_input["input_ids"]) -'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]' -``` - -Como puedes ver, el tokenizador ha añadido dos tokens especiales - `CLS` y `SEP` (clasificador y separador) - a la frase. No todos los modelos necesitan -tokens especiales, pero si lo llegas a necesitar, el tokenizador los añadirá automáticamente. - -Si hay varias frases que quieres preprocesar, pasa las frases como una lista al tokenizador: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_inputs = tokenizer(batch_sentences) ->>> print(encoded_inputs) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1]]} -``` - -### Pad - -Esto nos lleva a un tema importante. Cuando se procesa un batch de frases, no siempre tienen la misma longitud. Esto es un problema porque los tensores que se introducen en el modelo deben tener una forma uniforme. El pad es una estrategia para asegurar que los tensores sean rectangulares añadiendo un "padding token" especial a las oraciones con menos tokens. - -Establece el parámetro `padding` en `True` aplicando el pad a las secuencias más cortas del batch para que coincidan con la secuencia más larga: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... 
] ->>> encoded_input = tokenizer(batch_sentences, padding=True) ->>> print(encoded_input) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} -``` - -Observa que el tokenizador ha aplicado el pad a la primera y la tercera frase con un "0" porque son más cortas. - -### Truncamiento - -En el otro extremo del espectro, a veces una secuencia puede ser demasiado larga para un modelo. En este caso, tendrás que truncar la secuencia a una longitud más corta. - -Establece el parámetro `truncation` a `True` para truncar una secuencia a la longitud máxima aceptada por el modelo: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True) ->>> print(encoded_input) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} -``` - -### Construye tensores - -Finalmente, si quieres que el tokenizador devuelva los tensores reales que se introducen en el modelo. - -Establece el parámetro `return_tensors` como `pt` para PyTorch, o `tf` para TensorFlow: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors="pt") ->>> print(encoded_input) -{'input_ids': tensor([[ 101, 153, 7719, 21490, 1122, 1114, 9582, 1623, 102], - [ 101, 5226, 1122, 9649, 1199, 2610, 1236, 102, 0]]), - 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0]]), - 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 0]])} -===PT-TF-SPLIT=== ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors="tf") ->>> print(encoded_input) -{'input_ids': , - 'token_type_ids': , - 'attention_mask': } -``` - -## Audio - -Las entradas de audio se preprocesan de forma diferente a las entradas textuales, pero el objetivo final es el mismo: crear secuencias numéricas que el modelo pueda entender. 
Un [extractor de características](main_classes/feature_extractor) (o feature extractor en inglés) está diseñado para extraer características de datos provenientes de imágenes o audio sin procesar y convertirlos en tensores. Antes de empezar, instala 🤗 Datasets para cargar un dataset de audio para experimentar: - -```bash -pip install datasets -``` - -Carga la tarea de detección de palabras clave del benchmark [SUPERB](https://huggingface.co/datasets/superb) (consulta el [tutorial 🤗 Dataset](https://huggingface.co/docs/datasets/load_hub.html) para que obtengas más detalles sobre cómo cargar un dataset): - -```py ->>> from datasets import load_dataset, Audio - ->>> dataset = load_dataset("superb", "ks") -``` - -Accede al primer elemento de la columna `audio` para echar un vistazo a la entrada. Al llamar a la columna `audio` se cargará y volverá a muestrear automáticamente el archivo de audio: - -```py ->>> dataset["train"][0]["audio"] -{'array': array([ 0. , 0. , 0. , ..., -0.00592041, - -0.00405884, -0.00253296], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/05734a36d88019a09725c20cc024e1c4e7982e37d7d55c0c1ca1742ea1cdd47f/_background_noise_/doing_the_dishes.wav', - 'sampling_rate': 16000} -``` - -Esto devuelve tres elementos: - -* `array` es la señal de voz cargada - y potencialmente remuestreada - como un array 1D. -* `path` apunta a la ubicación del archivo de audio. -* `sampling_rate` se refiere a cuántos puntos de datos de la señal de voz se miden por segundo. - -### Resample - -Para este tutorial, se utilizará el modelo [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base). Como puedes ver en la model card, el modelo Wav2Vec2 está pre-entrenado en audio de voz muestreado a 16kHz. Es importante que la tasa de muestreo de tus datos de audio coincida con la tasa de muestreo del dataset utilizado para pre-entrenar el modelo. Si la tasa de muestreo de tus datos no es la misma, deberás volver a muestrear tus datos de audio. - -Por ejemplo, carga el dataset [LJ Speech](https://huggingface.co/datasets/lj_speech) que tiene una tasa de muestreo de 22050kHz. Para utilizar el modelo Wav2Vec2 con este dataset, reduce la tasa de muestreo a 16kHz: - -```py ->>> lj_speech = load_dataset("lj_speech", split="train") ->>> lj_speech[0]["audio"] -{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ..., - 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', - 'sampling_rate': 22050} -``` - -1. Usa el método 🤗 Datasets' [`cast_column`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.cast_column) para reducir la tasa de muestreo a 16kHz: - -```py ->>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000)) -``` - -2. Carga el archivo de audio: - -```py ->>> lj_speech[0]["audio"] -{'array': array([-0.00064146, -0.00074657, -0.00068768, ..., 0.00068341, - 0.00014045, 0. ], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', - 'sampling_rate': 16000} -``` - -Como puedes ver, el `sampling_rate` se ha reducido a 16kHz. Ahora que sabes cómo funciona el resampling, volvamos a nuestro ejemplo anterior con el dataset SUPERB. 
- -### Extractor de características - -El siguiente paso es cargar un extractor de características para normalizar y aplicar el pad a la entrada. Cuando se aplica padding a los datos textuales, se añade un "0" para las secuencias más cortas. La misma idea se aplica a los datos de audio y el extractor de características de audio añadirá un "0" - interpretado como silencio - al "array". - -Carga el extractor de características con [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") -``` - -Pasa el `array` de audio al extractor de características. También te recomendamos añadir el argumento `sampling_rate` en el extractor de características para poder depurar mejor los errores silenciosos que puedan producirse. - -```py ->>> audio_input = [dataset["train"][0]["audio"]["array"]] ->>> feature_extractor(audio_input, sampling_rate=16000) -{'input_values': [array([ 0.00045439, 0.00045439, 0.00045439, ..., -0.1578519 , -0.10807519, -0.06727459], dtype=float32)]} -``` - -### Pad y truncamiento - -Al igual que el tokenizador, puedes aplicar padding o truncamiento para manejar secuencias variables en un batch. Fíjate en la longitud de la secuencia de estas dos muestras de audio: - -```py ->>> dataset["train"][0]["audio"]["array"].shape -(1522930,) - ->>> dataset["train"][1]["audio"]["array"].shape -(988891,) -``` - -Como puedes ver, el `sampling_rate` se ha reducido a 16kHz. - -```py ->>> def preprocess_function(examples): -... audio_arrays = [x["array"] for x in examples["audio"]] -... inputs = feature_extractor( -... audio_arrays, -... sampling_rate=16000, -... padding=True, -... max_length=1000000, -... truncation=True, -... ) -... return inputs -``` - -Aplica la función a los primeros ejemplos del dataset: - -```py ->>> processed_dataset = preprocess_function(dataset["train"][:5]) -``` - -Ahora echa un vistazo a las longitudes de las muestras procesadas: - -```py ->>> processed_dataset["input_values"][0].shape -(1000000,) - ->>> processed_dataset["input_values"][1].shape -(1000000,) -``` - -Las longitudes de las dos primeras muestras coinciden ahora con la longitud máxima especificada. - -## Visión - -También se utiliza un extractor de características para procesar imágenes para tareas de visión por computadora. Una vez más, el objetivo es convertir la imagen en bruto en un batch de tensores como entrada. - -Vamos a cargar el dataset [food101](https://huggingface.co/datasets/food101) para este tutorial. 
Usa el parámetro 🤗 Datasets `split` para cargar solo una pequeña muestra de la división de entrenamiento ya que el dataset es bastante grande: - -```py ->>> from datasets import load_dataset - ->>> dataset = load_dataset("food101", split="train[:100]") -``` - -A continuación, observa la imagen con la función 🤗 Datasets [`Image`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image): - -```py ->>> dataset[0]["image"] -``` - -![vision-preprocess-tutorial.png](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vision-preprocess-tutorial.png) - -### Extractor de características - -Carga el extractor de características con [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224") -``` - -### Aumento de Datos - -Para las tareas de visión por computadora es común añadir algún tipo de aumento de datos (o data augmentation) a las imágenes como parte del preprocesamiento. Puedes añadir el método de aumento de datos con cualquier librería que quieras, pero en este tutorial utilizarás el módulo [`transforms`](https://pytorch.org/vision/stable/transforms.html) de torchvision. - -1. Normaliza la imagen y utiliza [`Compose`](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html) para encadenar algunas transformaciones - [`RandomResizedCrop`](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html) y [`ColorJitter`](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html) - juntas: - -```py ->>> from torchvision.transforms import Compose, Normalize, RandomResizedCrop, ColorJitter, ToTensor - ->>> normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std) ->>> _transforms = Compose( -... [RandomResizedCrop(feature_extractor.size), ColorJitter(brightness=0.5, hue=0.5), ToTensor(), normalize] -... ) -``` - -2. El modelo acepta [`pixel_values`](model_doc/visionencoderdecoder#transformers.VisionEncoderDecoderModel.forward.pixel_values) como entrada. Este valor es generado por el extractor de características. Crea una función que genere `pixel_values` a partir de las transformaciones: - -```py ->>> def transforms(examples): -... examples["pixel_values"] = [_transforms(image.convert("RGB")) for image in examples["image"]] -... return examples -``` - -3. A continuación, utiliza 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform) para aplicar las transformaciones sobre la marcha: - -```py ->>> dataset.set_transform(transforms) -``` - -4. 
Ahora, cuando accedes a la imagen, observarás que el extractor de características ha añadido a la entrada del modelo `pixel_values`: - -```py ->>> dataset[0]["image"] -{'image': , - 'label': 6, - 'pixel_values': tensor([[[ 0.0353, 0.0745, 0.1216, ..., -0.9922, -0.9922, -0.9922], - [-0.0196, 0.0667, 0.1294, ..., -0.9765, -0.9843, -0.9922], - [ 0.0196, 0.0824, 0.1137, ..., -0.9765, -0.9686, -0.8667], - ..., - [ 0.0275, 0.0745, 0.0510, ..., -0.1137, -0.1216, -0.0824], - [ 0.0667, 0.0824, 0.0667, ..., -0.0588, -0.0745, -0.0980], - [ 0.0353, 0.0353, 0.0431, ..., -0.0039, -0.0039, -0.0588]], - - [[ 0.2078, 0.2471, 0.2863, ..., -0.9451, -0.9373, -0.9451], - [ 0.1608, 0.2471, 0.3098, ..., -0.9373, -0.9451, -0.9373], - [ 0.2078, 0.2706, 0.3020, ..., -0.9608, -0.9373, -0.8275], - ..., - [-0.0353, 0.0118, -0.0039, ..., -0.2392, -0.2471, -0.2078], - [ 0.0196, 0.0353, 0.0196, ..., -0.1843, -0.2000, -0.2235], - [-0.0118, -0.0039, -0.0039, ..., -0.0980, -0.0980, -0.1529]], - - [[ 0.3961, 0.4431, 0.4980, ..., -0.9216, -0.9137, -0.9216], - [ 0.3569, 0.4510, 0.5216, ..., -0.9059, -0.9137, -0.9137], - [ 0.4118, 0.4745, 0.5216, ..., -0.9137, -0.8902, -0.7804], - ..., - [-0.2314, -0.1922, -0.2078, ..., -0.4196, -0.4275, -0.3882], - [-0.1843, -0.1686, -0.2000, ..., -0.3647, -0.3804, -0.4039], - [-0.1922, -0.1922, -0.1922, ..., -0.2941, -0.2863, -0.3412]]])} -``` - -Este es el aspecto de la imagen después de preprocesarla. Como era de esperar por las transformaciones aplicadas, la imagen ha sido recortada aleatoriamente y sus propiedades de color son diferentes. - -```py ->>> import numpy as np ->>> import matplotlib.pyplot as plt - ->>> img = dataset[0]["pixel_values"] ->>> plt.imshow(img.permute(1, 2, 0)) -``` - -![preprocessed_image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/preprocessed_image.png) - -## Multimodal - -Para las tareas multimodales utilizarás una combinación de todo lo que has aprendido hasta ahora y aplicarás tus habilidades a una tarea de reconocimiento automático de voz (ASR). Esto significa que necesitarás un: - -* Extractor de características para preprocesar los datos de audio. -* Un tokenizador para procesar el texto. 
- -Volvamos al dataset [LJ Speech](https://huggingface.co/datasets/lj_speech): - -```py ->>> from datasets import load_dataset - ->>> lj_speech = load_dataset("lj_speech", split="train") -``` - -Suponiendo que te interesan principalmente las columnas `audio` y `texto`, elimina las demás columnas: - -```py ->>> lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"]) -``` - -Ahora echa un vistazo a las columnas `audio` y `texto`: - -```py ->>> lj_speech[0]["audio"] -{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ..., - 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', - 'sampling_rate': 22050} - ->>> lj_speech[0]["text"] -'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition' -``` - -Recuerda la sección anterior sobre el procesamiento de datos de audio, siempre debes [volver a muestrear](preprocessing#audio) la tasa de muestreo de tus datos de audio para que coincida con la tasa de muestreo del dataset utilizado para preentrenar un modelo: - -```py ->>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000)) -``` - -### Processor - -Un processor combina un extractor de características y un tokenizador. Cargue un procesador con [`AutoProcessor.from_pretrained]: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") -``` - -1. Crea una función para procesar los datos de audio en `input_values`, y tokeniza el texto en `labels`. Estas son las entradas del modelo: - -```py ->>> def prepare_dataset(example): -... audio = example["audio"] - -... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000)) - -... return example -``` - -2. Aplica la función `prepare_dataset` a una muestra: - -```py ->>> prepare_dataset(lj_speech[0]) -``` - -Observa que el método processor ha añadido `input_values` y `labels`. La tasa de muestreo también se ha reducido correctamente a 16kHz. - -Genial, ahora deberías ser capaz de preprocesar datos para cualquier modalidad e incluso combinar diferentes modalidades. En el siguiente tutorial, aprenderás aplicar fine tuning a un modelo en tus datos recién preprocesados. - -## Todo lo que siempre quisiste saber sobre el padding y el truncamiento - -Hemos visto los comandos que funcionarán para la mayoría de los casos (hacer pad a tu batch teniendo en cuenta la longitud de la frase máxima y -truncar a la longitud máxima que el modelo puede aceptar). Sin embargo, la API admite más estrategias si las necesitas. Los -tres argumentos que necesitas conocer para ello son `padding`, `truncation` y `max_length`. - -- `padding` controla el aplicarme padding al texto. Puede ser un booleano o una cadena que debe ser: - - - `True` o `'longest'` para aplicar el pad hasta la secuencia más larga del batch (no apliques el padding si sólo le proporcionas - una sola secuencia). - - `'max_length'` para aplicar el pad hasta la longitud especificada por el argumento `max_length` o la longitud máxima aceptada - por el modelo si no le proporcionas `longitud_máxima` (`longitud_máxima=None`). Si sólo le proporcionas una única secuencia - se le aplicará el padding. - `False` o `'do_not_pad'` para no aplicar pad a las secuencias. 
Como hemos visto antes, este es el comportamiento por - defecto. - -- `truncation` controla el truncamiento. Puede ser un booleano o una string que debe ser: - - - `True` o `'longest_first'` truncan hasta la longitud máxima especificada por el argumento `max_length` o - la longitud máxima aceptada por el modelo si no le proporcionas `max_length` (`max_length=None`). Esto - truncará token por token, eliminando un token de la secuencia más larga del par hasta alcanzar la longitud - adecuada. - - `'only_second'` trunca hasta la longitud máxima especificada por el argumento `max_length` o la - longitud máxima aceptada por el modelo si no le proporcionas `max_length` (`max_length=None`). Esto sólo truncará - la segunda frase de un par si le proporcionas un par de secuencias (o un batch de pares de secuencias). - - `'only_first'` trunca hasta la longitud máxima especificada por el argumento `max_length` o la longitud máxima - aceptada por el modelo si no se proporciona `max_length` (`max_length=None`). Esto sólo truncará - la primera frase de un par si se proporciona un par de secuencias (o un lote de pares de secuencias). - - `False` o `'do_not_truncate'` para no truncar las secuencias. Como hemos visto antes, este es el comportamiento - por defecto. - -- `max_length` para controlar la longitud del padding/truncamiento. Puede ser un número entero o `None`, en cuyo caso -será por defecto la longitud máxima que el modelo puede aceptar. Si el modelo no tiene una longitud máxima de entrada específica, el -padding/truncamiento a `longitud_máxima` se desactiva. - -A continuación te mostramos en una tabla que resume la forma recomendada de configurar el padding y el truncamiento. Si utilizas un par de secuencias de entrada en -algunos de los siguientes ejemplos, puedes sustituir `truncation=True` por una `STRATEGY` seleccionada en -`['only_first', 'only_second', 'longest_first']`, es decir, `truncation='only_second'` o `truncation= 'longest_first'` para controlar cómo se truncan ambas secuencias del par como se ha detallado anteriormente. 
- -| Truncation | Padding | Instrucciones | -|--------------------------------------|-----------------------------------|---------------------------------------------------------------------------------------------| -| no truncation | no padding | `tokenizer(batch_sentences)` | -| | padding secuencia max del batch | `tokenizer(batch_sentences, padding=True)` or | -| | | `tokenizer(batch_sentences, padding='longest')` | -| | padding long max de input model | `tokenizer(batch_sentences, padding='max_length')` | -| | padding a una long especifica | `tokenizer(batch_sentences, padding='max_length', max_length=42)` | -| truncation long max del input model | no padding | `tokenizer(batch_sentences, truncation=True)` or | -| | | `tokenizer(batch_sentences, truncation=STRATEGY)` | -| | padding secuencia max del batch | `tokenizer(batch_sentences, padding=True, truncation=True)` or | -| | | `tokenizer(batch_sentences, padding=True, truncation=STRATEGY)` | -| | padding long max de input model | `tokenizer(batch_sentences, padding='max_length', truncation=True)` or | -| | | `tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY)` | -| | padding a una long especifica | Not possible | -| truncation a una long especifica | no padding | `tokenizer(batch_sentences, truncation=True, max_length=42)` or | -| | | `tokenizer(batch_sentences, truncation=STRATEGY, max_length=42)` | -| | padding secuencia max del batch | `tokenizer(batch_sentences, padding=True, truncation=True, max_length=42)` or | -| | | `tokenizer(batch_sentences, padding=True, truncation=STRATEGY, max_length=42)` | -| | padding long max de input model | Not possible | -| | padding a una long especifica | `tokenizer(batch_sentences, padding='max_length', truncation=True, max_length=42)` or | -| | | `tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY, max_length=42)` | - - - - - - - - diff --git a/docs/source/es/quicktour.md b/docs/source/es/quicktour.md new file mode 100644 index 000000000000..ad2549ef450b --- /dev/null +++ b/docs/source/es/quicktour.md @@ -0,0 +1,401 @@ + + +# Tour rápido + +[[open-in-colab]] + +¡Entra en marcha con los 🤗 Transformers! Comienza usando [`pipeline`] para una inferencia veloz, carga un modelo preentrenado y un tokenizador con una [AutoClass](./model_doc/auto) para resolver tu tarea de texto, visión o audio. + + + +Todos los ejemplos de código presentados en la documentación tienen un botón arriba a la derecha para elegir si quieres ocultar o mostrar el código en Pytorch o TensorFlow. +Si no fuese así, se espera que el código funcione para ambos backends sin ningún cambio. + + + +## Pipeline + +[`pipeline`] es la forma más fácil de usar un modelo preentrenado para una tarea dada. + + + +El [`pipeline`] soporta muchas tareas comunes listas para usar: + +**Texto**: +* Análisis de Sentimiento (Sentiment Analysis, en inglés): clasifica la polaridad de un texto dado. +* Generación de Texto (Text Generation, en inglés): genera texto a partir de un input dado. +* Reconocimiento de Entidades (Name Entity Recognition o NER, en inglés): etiqueta cada palabra con la entidad que representa (persona, fecha, ubicación, etc.). +* Responder Preguntas (Question answering, en inglés): extrae la respuesta del contexto dado un contexto y una pregunta. +* Rellenar Máscara (Fill-mask, en inglés): rellena el espacio faltante dado un texto con palabras enmascaradas. +* Resumir (Summarization, en inglés): genera un resumen de una secuencia larga de texto o un documento. 
+* Traducción (Translation, en inglés): traduce un texto a otro idioma. +* Extracción de Características (Feature Extraction, en inglés): crea una representación tensorial del texto. + +**Imagen**: +* Clasificación de Imágenes (Image Classification, en inglés): clasifica una imagen. +* Segmentación de Imágenes (Image Segmentation, en inglés): clasifica cada pixel de una imagen. +* Detección de Objetos (Object Detection, en inglés): detecta objetos dentro de una imagen. + +**Audio**: +* Clasificación de Audios (Audio Classification, en inglés): asigna una etiqueta a un segmento de audio. +* Reconocimiento de Voz Automático (Automatic Speech Recognition o ASR, en inglés): transcribe datos de audio a un texto. + + + +Para más detalles acerca del [`pipeline`] y tareas asociadas, consulta la documentación [aquí](./main_classes/pipelines). + + + +### Uso del Pipeline + +En el siguiente ejemplo, usarás el [`pipeline`] para análisis de sentimiento. + +Instala las siguientes dependencias si aún no lo has hecho: + + + + +```bash +pip install torch +``` + + + +```bash +pip install tensorflow +``` + + + +Importa [`pipeline`] y especifica la tarea que deseas completar: + +```py +>>> from transformers import pipeline + +>>> clasificador = pipeline("sentiment-analysis", model="pysentimiento/robertuito-sentiment-analysis") +``` + +El pipeline descarga y almacena en caché el [modelo preentrenado](https://huggingface.co/pysentimiento/robertuito-sentiment-analysis) y tokeniza para análisis de sentimiento. Si no hubieramos elegido un modelo el pipeline habría elegido uno por defecto. Ahora puedes usar `clasificador` en tu texto objetivo: + +```py +>>> clasificador("Estamos muy felices de mostrarte la biblioteca de 🤗 Transformers.") +[{'label': 'POS', 'score': 0.9320}] +``` + +Para más de un enunciado, entrega una lista al [`pipeline`] que devolverá una lista de diccionarios: + +El [`pipeline`] también puede iterar sobre un dataset entero. Comienza instalando la biblioteca [🤗 Datasets](https://huggingface.co/docs/datasets/): + +```bash +pip install datasets +``` + +Crea un [`pipeline`] con la tarea que deseas resolver y el modelo que quieres usar. Coloca el parámetro `device` a `0` para poner los tensores en un dispositivo CUDA: + +```py +>>> import torch +>>> from transformers import pipeline + +>>> reconocedor_de_voz = pipeline( +... "automatic-speech-recognition", model="jonatasgrosman/wav2vec2-large-xlsr-53-spanish", device=0 +... ) +``` + +A continuación, carga el dataset (ve 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart.html) para más detalles) sobre el que quisieras iterar. Por ejemplo, vamos a cargar el dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14): + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("PolyAI/minds14", name="es-ES", split="train") # doctest: +IGNORE_RESULT +``` + +Debemos asegurarnos de que la frecuencia de muestreo del conjunto de datos coincide con la frecuencia de muestreo con la que se entrenó `jonatasgrosman/wav2vec2-large-xlsr-53-spanish`. + +```py +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=reconocedor_de_voz.feature_extractor.sampling_rate)) +``` + +Los archivos de audio se cargan y remuestrean automáticamente cuando llamamos a la columna `"audio"`. 
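+
+Si quieres comprobarlo (un paso opcional y meramente ilustrativo), inspecciona la frecuencia de muestreo de una muestra ya convertida:
+
+```py
+>>> frecuencia_de_muestreo = dataset[0]["audio"]["sampling_rate"]
+>>> # tras `cast_column`, debería coincidir con reconocedor_de_voz.feature_extractor.sampling_rate (16 kHz para Wav2Vec2)
+```
+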
+Extraigamos las matrices de onda cruda (raw waveform, en inglés) de las primeras 4 muestras y pasémosla como una lista al pipeline: + +```py +>>> resultado = reconocedor_de_voz(dataset[:4]["audio"]) +>>> print([d["text"] for d in resultado]) +['ahora buenas eh a ver tengo un problema con vuestra aplicación resulta que que quiero hacer una transferencia bancaria a una cuenta conocida pero me da error la aplicación a ver que a ver que puede ser', 'la aplicación no cargue saldo de mi nueva cuenta', 'hola tengo un problema con la aplicación no carga y y tampoco veo que carga el saldo de mi cuenta nueva dice que la aplicación está siendo reparada y ahora no puedo acceder a mi cuenta no necesito inmediatamente', 'hora buena la aplicación no se carga la vida no carga el saldo de mi cuenta nueva dice que la villadenta siendo reparada y oro no puedo hacer a mi cuenta'] +``` + +Para un dataset más grande, donde los inputs son de mayor tamaño (como en habla/audio o visión), querrás pasar un generador en lugar de una lista que carga todos los inputs en memoria. Ve la [documentación del pipeline](./main_classes/pipelines) para más información. + +### Usa otro modelo y otro tokenizador en el pipeline + +El [`pipeline`] puede acomodarse a cualquier modelo del [Model Hub](https://huggingface.co/models) haciendo más fácil adaptar el [`pipeline`] para otros casos de uso. Por ejemplo, si quisieras un modelo capaz de manejar texto en francés, usa los tags en el Model Hub para filtrar entre los modelos apropiados. El resultado mejor filtrado devuelve un [modelo BERT](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment) multilingual fine-tuned para el análisis de sentimiento. Genial, ¡vamos a usar este modelo! + +```py +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +``` + + + +Usa [`AutoModelForSequenceClassification`] y ['AutoTokenizer'] para cargar un modelo preentrenado y un tokenizador asociado (más en un `AutoClass` debajo): + +```py +>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained(model_name) +>>> tokenizer = AutoTokenizer.from_pretrained(model_name) +``` + + + + +Usa [`TFAutoModelForSequenceClassification`] y ['AutoTokenizer'] para cargar un modelo preentrenado y un tokenizador asociado (más en un `TFAutoClass` debajo): + +```py +>>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name) +>>> tokenizer = AutoTokenizer.from_pretrained(model_name) +``` + + + + +Después puedes especificar el modelo y el tokenizador en el [`pipeline`], y aplicar el `classifier` en tu texto objetivo: + +```py +>>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer) +>>> classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.") +[{'label': '5 stars', 'score': 0.7273}] +``` + +Si no pudieras encontrar el modelo para tu caso respectivo de uso necesitarás ajustar un modelo preentrenado a tus datos. Mira nuestro [tutorial de fine-tuning](./training) para aprender cómo. Finalmente, después de que has ajustado tu modelo preentrenado, ¡por favor considera compartirlo (ve el tutorial [aquí](./model_sharing)) con la comunidad en el Model Hub para democratizar el NLP! 🤗 + +## AutoClass + + + +Por debajo, las clases [`AutoModelForSequenceClassification`] y [`AutoTokenizer`] trabajan juntas para dar poder al [`pipeline`]. 
Una [AutoClass](./model_doc/auto) es un atajo que automáticamente recupera la arquitectura de un modelo preentrenado con su nombre o el path. Sólo necesitarás seleccionar el `AutoClass` apropiado para tu tarea y tu tokenizador asociado con [`AutoTokenizer`]. + +Regresemos a nuestro ejemplo y veamos cómo puedes usar el `AutoClass` para reproducir los resultados del [`pipeline`]. + +### AutoTokenizer + +Un tokenizador es responsable de procesar el texto a un formato que sea entendible para el modelo. Primero, el tokenizador separará el texto en palabras llamadas *tokens*. Hay múltiples reglas que gobiernan el proceso de tokenización incluyendo el cómo separar una palabra y en qué nivel (aprende más sobre tokenización [aquí](./tokenizer_summary)). Lo más importante es recordar que necesitarás instanciar el tokenizador con el mismo nombre del modelo para asegurar que estás usando las mismas reglas de tokenización con las que el modelo fue preentrenado. + +Carga un tokenizador con [`AutoTokenizer`]: + +```py +>>> from transformers import AutoTokenizer + +>>> nombre_del_modelo = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> tokenizer = AutoTokenizer.from_pretrained(nombre_del_modelo) +``` + +Después, el tokenizador convierte los tokens a números para construir un tensor que servirá como input para el modelo. Esto es conocido como el *vocabulario* del modelo. + +Pasa tu texto al tokenizador: + +```py +>>> encoding = tokenizer("Estamos muy felices de mostrarte la biblioteca de 🤗 Transformers.") +>>> print(encoding) +{'input_ids': [101, 10602, 14000, 13653, 43353, 10107, 10102, 47201, 10218, 10106, 18283, 10102, 100, 58263, 119, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +El tokenizador devolverá un diccionario conteniendo: + +* [input_ids](./glossary#input-ids): representaciones numéricas de los tokens. +* [atttention_mask](.glossary#attention-mask): indica cuáles tokens deben ser atendidos. + +Como con el [`pipeline`], el tokenizador aceptará una lista de inputs. Además, el tokenizador también puede rellenar (pad, en inglés) y truncar el texto para devolver un lote (batch, en inglés) de longitud uniforme: + + + + +```py +>>> pt_batch = tokenizer( +... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], +... padding=True, +... truncation=True, +... max_length=512, +... return_tensors="pt", +... ) +``` + + + +```py +>>> tf_batch = tokenizer( +... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], +... padding=True, +... truncation=True, +... max_length=512, +... return_tensors="tf", +... ) +``` + + + +Lee el tutorial de [preprocessing](./preprocessing) para más detalles acerca de la tokenización. + +### AutoModel + + + +🤗 Transformers provee una forma simple y unificada de cargar tus instancias preentrenadas. Esto significa que puedes cargar un [`AutoModel`] como cargarías un [`AutoTokenizer`]. La única diferencia es seleccionar el [`AutoModel`] correcto para la tarea. Ya que estás clasificando texto, o secuencias, carga [`AutoModelForSequenceClassification`]: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name) +``` + + + +Ve el [task summary](./task_summary) para revisar qué clase del [`AutoModel`] deberías usar para cada tarea. 
+ + + +Ahora puedes pasar tu lote (batch) preprocesado de inputs directamente al modelo. Solo tienes que desempacar el diccionario añadiendo `**`: + +```py +>>> pt_outputs = pt_model(**pt_batch) +``` + +El modelo producirá las activaciones finales en el atributo `logits`. Aplica la función softmax a `logits` para obtener las probabilidades: + +```py +>>> from torch import nn + +>>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1) +>>> print(pt_predictions) +tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], + [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=) +``` + + +🤗 Transformers provee una forma simple y unificada de cargar tus instancias preentrenadas. Esto significa que puedes cargar un [`TFAutoModel`] como cargarías un [`AutoTokenizer`]. La única diferencia es seleccionar el [`TFAutoModel`] correcto para la tarea. Ya que estás clasificando texto, o secuencias, carga [`TFAutoModelForSequenceClassification`]: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name) +``` + + + Ve el [task summary](./task_summary) para revisar qué clase del [`AutoModel`] + deberías usar para cada tarea. + + +Ahora puedes pasar tu lote preprocesado de inputs directamente al modelo pasando las llaves del diccionario directamente a los tensores: + +```py +>>> tf_outputs = tf_model(tf_batch) +``` + +El modelo producirá las activaciones finales en el atributo `logits`. Aplica la función softmax a `logits` para obtener las probabilidades: + +```py +>>> import tensorflow as tf + +>>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1) +>>> print(tf.math.round(tf_predictions * 10**4) / 10**4) +tf.Tensor( +[[0.0021 0.0018 0.0116 0.2121 0.7725] + [0.2084 0.1826 0.1969 0.1755 0.2365]], shape=(2, 5), dtype=float32) +``` + + + + + +Todos los modelos de 🤗 Transformers (PyTorch o TensorFlow) producirán los tensores *antes* de la función de activación +final (como softmax) porque la función de activación final es comúnmente fusionada con la pérdida. + + + +Los modelos son [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) o [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) estándares así que podrás usarlos en tu training loop usual. Sin embargo, para facilitar las cosas, 🤗 Transformers provee una clase [`Trainer`] para PyTorch que añade funcionalidades para entrenamiento distribuido, precición mixta, y más. Para TensorFlow, puedes usar el método `fit` desde [Keras](https://keras.io/). Consulta el [tutorial de entrenamiento](./training) para más detalles. + + + +Los outputs del modelo de 🤗 Transformers son dataclasses especiales por lo que sus atributos pueden ser completados en un IDE. +Los outputs del modelo también se comportan como tuplas o diccionarios (e.g., puedes indexar con un entero, un slice o una cadena) en cuyo caso los atributos que son `None` son ignorados. 
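+
+Por ejemplo, con el `pt_outputs` calculado arriba (un fragmento puramente ilustrativo; el índice concreto depende de qué atributos no sean `None`):
+
+```py
+>>> # las tres formas de acceso deberían devolver el mismo tensor de logits
+>>> logits_por_atributo = pt_outputs.logits
+>>> logits_por_clave = pt_outputs["logits"]
+>>> logits_por_indice = pt_outputs[0]  # `loss` es `None` porque no pasamos labels, así que se omite al indexar
+```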
+ + + +### Guarda un modelo + + + +Una vez que se haya hecho fine-tuning a tu modelo puedes guardarlo con tu tokenizador usando [`PreTrainedModel.save_pretrained`]: + +```py +>>> pt_save_directory = "./pt_save_pretrained" +>>> tokenizer.save_pretrained(pt_save_directory) # doctest: +IGNORE_RESULT +>>> pt_model.save_pretrained(pt_save_directory) +``` + +Cuando quieras usar el modelo otra vez cárgalo con [`PreTrainedModel.from_pretrained`]: + +```py +>>> pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained") +``` + + + + +Una vez que se haya hecho fine-tuning a tu modelo puedes guardarlo con tu tokenizador usando [`TFPreTrainedModel.save_pretrained`]: + +```py +>>> tf_save_directory = "./tf_save_pretrained" +>>> tokenizer.save_pretrained(tf_save_directory) # doctest: +IGNORE_RESULT +>>> tf_model.save_pretrained(tf_save_directory) +``` + +Cuando quieras usar el modelo otra vez cárgalo con [`TFPreTrainedModel.from_pretrained`]: + +```py +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained") +``` + + + +Una característica particularmente interesante de 🤗 Transformers es la habilidad de guardar el modelo y cargarlo como un modelo de PyTorch o TensorFlow. El parámetro `from_pt` o `from_tf` puede convertir el modelo de un framework al otro: + + + + +```py +>>> from transformers import AutoModel + +>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) +``` + + + +```py +>>> from transformers import TFAutoModel + +>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) +``` + + diff --git a/docs/source/es/quicktour.mdx b/docs/source/es/quicktour.mdx deleted file mode 100644 index 408c3fa375a0..000000000000 --- a/docs/source/es/quicktour.mdx +++ /dev/null @@ -1,391 +0,0 @@ - - -# Tour rápido - -[[open-in-colab]] - -¡Entra en marcha con los 🤗 Transformers! Comienza usando [`pipeline`] para una inferencia veloz, carga un modelo preentrenado y un tokenizador con una [AutoClass](./model_doc/auto) para resolver tu tarea de texto, visión o audio. - - - -Todos los ejemplos de código presentados en la documentación tienen un botón arriba a la derecha para elegir si quieres ocultar o mostrar el código en Pytorch o TensorFlow. -Si no fuese así, se espera que el código funcione para ambos backends sin ningún cambio. - - - -## Pipeline - -[`pipeline`] es la forma más fácil de usar un modelo preentrenado para una tarea dada. - - - -El [`pipeline`] soporta muchas tareas comunes listas para usar: - -**Texto**: -* Análisis de Sentimiento (Sentiment Analysis, en inglés): clasifica la polaridad de un texto dado. -* Generación de Texto (Text Generation, en inglés): genera texto a partir de un input dado. -* Reconocimiento de Entidades (Name Entity Recognition o NER, en inglés): etiqueta cada palabra con la entidad que representa (persona, fecha, ubicación, etc.). -* Responder Preguntas (Question answering, en inglés): extrae la respuesta del contexto dado un contexto y una pregunta. -* Rellenar Máscara (Fill-mask, en inglés): rellena el espacio faltante dado un texto con palabras enmascaradas. -* Resumir (Summarization, en inglés): genera un resumen de una secuencia larga de texto o un documento. -* Traducción (Translation, en inglés): traduce un texto a otro idioma. 
-* Extracción de Características (Feature Extraction, en inglés): crea una representación tensorial del texto. - -**Imagen**: -* Clasificación de Imágenes (Image Classification, en inglés): clasifica una imagen. -* Segmentación de Imágenes (Image Segmentation, en inglés): clasifica cada pixel de una imagen. -* Detección de Objetos (Object Detection, en inglés): detecta objetos dentro de una imagen. - -**Audio**: -* Clasificación de Audios (Audio Classification, en inglés): asigna una etiqueta a un segmento de audio. -* Reconocimiento de Voz Automático (Automatic Speech Recognition o ASR, en inglés): transcribe datos de audio a un texto. - - - -Para más detalles acerca del [`pipeline`] y tareas asociadas, consulta la documentación [aquí](./main_classes/pipelines). - - - -### Uso del Pipeline - -En el siguiente ejemplo, usarás el [`pipeline`] para análisis de sentimiento. - -Instala las siguientes dependencias si aún no lo has hecho: - - - -```bash -pip install torch -``` - - -```bash -pip install tensorflow -``` - - - -Importa [`pipeline`] y especifica la tarea que deseas completar: - -```py ->>> from transformers import pipeline - ->>> clasificador = pipeline("sentiment-analysis", model="pysentimiento/robertuito-sentiment-analysis") -``` - -El pipeline descarga y almacena en caché el [modelo preentrenado](https://huggingface.co/pysentimiento/robertuito-sentiment-analysis) y tokeniza para análisis de sentimiento. Si no hubieramos elegido un modelo el pipeline habría elegido uno por defecto. Ahora puedes usar `clasificador` en tu texto objetivo: - -```py ->>> clasificador("Estamos muy felices de mostrarte la biblioteca de 🤗 Transformers.") -[{'label': 'POS', 'score': 0.9916}] -``` - -Para más de un enunciado, entrega una lista al [`pipeline`] que devolverá una lista de diccionarios: - -El [`pipeline`] también puede iterar sobre un dataset entero. Comienza instalando la biblioteca [🤗 Datasets](https://huggingface.co/docs/datasets/): - -```bash -pip install datasets -``` - -Crea un [`pipeline`] con la tarea que deseas resolver y el modelo que quieres usar. Coloca el parámetro `device` a `0` para poner los tensores en un dispositivo CUDA: - -```py ->>> import torch ->>> from transformers import pipeline - ->>> reconocedor_de_voz = pipeline( -... "automatic-speech-recognition", model="jonatasgrosman/wav2vec2-large-xlsr-53-spanish", device=0 -... ) -``` - -A continuación, carga el dataset (ve 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart.html) para más detalles) sobre el que quisieras iterar. Por ejemplo, vamos a cargar el dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14): - -```py ->>> from datasets import load_dataset, Audio - ->>> dataset = load_dataset("PolyAI/minds14", name="es-ES", split="train") # doctest: +IGNORE_RESULT -``` - -Debemos asegurarnos de que la frecuencia de muestreo del conjunto de datos coincide con la frecuencia de muestreo con la que se entrenó `jonatasgrosman/wav2vec2-large-xlsr-53-spanish`. - -```py ->>> dataset = dataset.cast_column("audio", Audio(sampling_rate=reconocedor_de_voz.feature_extractor.sampling_rate)) -``` - -Los archivos de audio se cargan y remuestrean automáticamente cuando llamamos a la columna `"audio"`. 
-Extraigamos las matrices de onda cruda (raw waveform, en inglés) de las primeras 4 muestras y pasémosla como una lista al pipeline: - -```py ->>> resultado = reconocedor_de_voz(dataset[:4]["audio"]) ->>> print([d["text"] for d in resultado]) -['ahora buenas eh a ver tengo un problema con vuestra aplicación resulta que que quiero hacer una transferencia bancaria a una cuenta conocida pero me da error la aplicación a ver que a ver que puede ser', 'la aplicación no cargue saldo de mi nueva cuenta', 'hola tengo un problema con la aplicación no carga y y tampoco veo que carga el saldo de mi cuenta nueva dice que la aplicación está siendo reparada y ahora no puedo acceder a mi cuenta no necesito inmediatamente', 'hora buena la aplicación no se carga la vileza no carga el saldo de mi cuenta nueva dice que la villadenta siendo reparada y oro no puedo hacer a mi cuenta'] -``` - -Para un dataset más grande, donde los inputs son de mayor tamaño (como en habla/audio o visión), querrás pasar un generador en lugar de una lista que carga todos los inputs en memoria. Ve la [documentación del pipeline](./main_classes/pipelines) para más información. - -### Usa otro modelo y otro tokenizador en el pipeline - -El [`pipeline`] puede acomodarse a cualquier modelo del [Model Hub](https://huggingface.co/models) haciendo más fácil adaptar el [`pipeline`] para otros casos de uso. Por ejemplo, si quisieras un modelo capaz de manejar texto en francés, usa los tags en el Model Hub para filtrar entre los modelos apropiados. El resultado mejor filtrado devuelve un [modelo BERT](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment) multilingual fine-tuned para el análisis de sentimiento. Genial, ¡vamos a usar este modelo! - -```py ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" -``` - - - -Usa [`AutoModelForSequenceClassification`] y ['AutoTokenizer'] para cargar un modelo preentrenado y un tokenizador asociado (más en un `AutoClass` debajo): - -```py ->>> from transformers import AutoTokenizer, AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained(model_name) ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) -``` - - - - -Usa [`TFAutoModelForSequenceClassification`] y ['AutoTokenizer'] para cargar un modelo preentrenado y un tokenizador asociado (más en un `TFAutoClass` debajo): - -```py ->>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name) ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) -``` - - - - -Después puedes especificar el modelo y el tokenizador en el [`pipeline`], y aplicar el `classifier` en tu texto objetivo: - -```py ->>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer) ->>> classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.") -[{'label': '5 stars', 'score': 0.7273}] -``` - -Si no pudieras encontrar el modelo para tu caso respectivo de uso necesitarás ajustar un modelo preentrenado a tus datos. Mira nuestro [tutorial de fine-tuning](./training) para aprender cómo. Finalmente, después de que has ajustado tu modelo preentrenado, ¡por favor considera compartirlo (ve el tutorial [aquí](./model_sharing)) con la comunidad en el Model Hub para democratizar el NLP! 🤗 - -## AutoClass - - - -Por debajo, las clases [`AutoModelForSequenceClassification`] y [`AutoTokenizer`] trabajan juntas para dar poder al [`pipeline`]. 
Una [AutoClass](./model_doc/auto) es un atajo que automáticamente recupera la arquitectura de un modelo preentrenado con su nombre o el path. Sólo necesitarás seleccionar el `AutoClass` apropiado para tu tarea y tu tokenizador asociado con [`AutoTokenizer`]. - -Regresemos a nuestro ejemplo y veamos cómo puedes usar el `AutoClass` para reproducir los resultados del [`pipeline`]. - -### AutoTokenizer - -Un tokenizador es responsable de procesar el texto a un formato que sea entendible para el modelo. Primero, el tokenizador separará el texto en palabras llamadas *tokens*. Hay múltiples reglas que gobiernan el proceso de tokenización incluyendo el cómo separar una palabra y en qué nivel (aprende más sobre tokenización [aquí](./tokenizer_summary)). Lo más importante es recordar que necesitarás instanciar el tokenizador con el mismo nombre del modelo para asegurar que estás usando las mismas reglas de tokenización con las que el modelo fue preentrenado. - -Carga un tokenizador con [`AutoTokenizer`]: - -```py ->>> from transformers import AutoTokenizer - ->>> nombre_del_modelo = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> tokenizer = AutoTokenizer.from_pretrained(nombre_del_modelo) -``` - -Después, el tokenizador convierte los tokens a números para construir un tensor que servirá como input para el modelo. Esto es conocido como el *vocabulario* del modelo. - -Pasa tu texto al tokenizador: - -```py ->>> encoding = tokenizer("Estamos muy felices de mostrarte la biblioteca de 🤗 Transformers.") ->>> print(encoding) -{'input_ids': [101, 10602, 14000, 13653, 43353, 10107, 10102, 47201, 10218, 10106, 18283, 10102, 100, 58263, 119, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -El tokenizador devolverá un diccionario conteniendo: - -* [input_ids](./glossary#input-ids): representaciones numéricas de los tokens. -* [atttention_mask](.glossary#attention-mask): indica cuáles tokens deben ser atendidos. - -Como con el [`pipeline`], el tokenizador aceptará una lista de inputs. Además, el tokenizador también puede rellenar (pad, en inglés) y truncar el texto para devolver un lote (batch, en inglés) de longitud uniforme: - - - -```py ->>> pt_batch = tokenizer( -... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], -... padding=True, -... truncation=True, -... max_length=512, -... return_tensors="pt", -... ) -``` - - -```py ->>> tf_batch = tokenizer( -... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], -... padding=True, -... truncation=True, -... max_length=512, -... return_tensors="tf", -... ) -``` - - - -Lee el tutorial de [preprocessing](./preprocessing) para más detalles acerca de la tokenización. - -### AutoModel - - - -🤗 Transformers provee una forma simple y unificada de cargar tus instancias preentrenadas. Esto significa que puedes cargar un [`AutoModel`] como cargarías un [`AutoTokenizer`]. La única diferencia es seleccionar el [`AutoModel`] correcto para la tarea. Ya que estás clasificando texto, o secuencias, carga [`AutoModelForSequenceClassification`]: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name) -``` - - - -Ve el [task summary](./task_summary) para revisar qué clase del [`AutoModel`] deberías usar para cada tarea. 
- - - -Ahora puedes pasar tu lote (batch) preprocesado de inputs directamente al modelo. Solo tienes que desempacar el diccionario añadiendo `**`: - -```py ->>> pt_outputs = pt_model(**pt_batch) -``` - -El modelo producirá las activaciones finales en el atributo `logits`. Aplica la función softmax a `logits` para obtener las probabilidades: - -```py ->>> from torch import nn - ->>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1) ->>> print(pt_predictions) -tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], - [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=) -``` - - -🤗 Transformers provee una forma simple y unificada de cargar tus instancias preentrenadas. Esto significa que puedes cargar un [`TFAutoModel`] como cargarías un [`AutoTokenizer`]. La única diferencia es seleccionar el [`TFAutoModel`] correcto para la tarea. Ya que estás clasificando texto, o secuencias, carga [`TFAutoModelForSequenceClassification`]: - -```py ->>> from transformers import TFAutoModelForSequenceClassification - ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name) -``` - - - Ve el [task summary](./task_summary) para revisar qué clase del [`AutoModel`] - deberías usar para cada tarea. - - -Ahora puedes pasar tu lote preprocesado de inputs directamente al modelo pasando las llaves del diccionario directamente a los tensores: - -```py ->>> tf_outputs = tf_model(tf_batch) -``` - -El modelo producirá las activaciones finales en el atributo `logits`. Aplica la función softmax a `logits` para obtener las probabilidades: - -```py ->>> import tensorflow as tf - ->>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1) ->>> print(tf.math.round(tf_predictions * 10**4) / 10**4) -tf.Tensor( -[[0.0021 0.0018 0.0116 0.2121 0.7725] - [0.2084 0.1826 0.1969 0.1755 0.2365]], shape=(2, 5), dtype=float32) -``` - - - - - -Todos los modelos de 🤗 Transformers (PyTorch o TensorFlow) producirán los tensores *antes* de la función de activación -final (como softmax) porque la función de activación final es comúnmente fusionada con la pérdida. - - - -Los modelos son [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) o [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) estándares así que podrás usarlos en tu training loop usual. Sin embargo, para facilitar las cosas, 🤗 Transformers provee una clase [`Trainer`] para PyTorch que añade funcionalidades para entrenamiento distribuido, precición mixta, y más. Para TensorFlow, puedes usar el método `fit` desde [Keras](https://keras.io/). Consulta el [tutorial de entrenamiento](./training) para más detalles. - - - -Los outputs del modelo de 🤗 Transformers son dataclasses especiales por lo que sus atributos pueden ser completados en un IDE. -Los outputs del modelo también se comportan como tuplas o diccionarios (e.g., puedes indexar con un entero, un slice o una cadena) en cuyo caso los atributos que son `None` son ignorados. 
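Como apunte ilustrativo (un fragmento mínimo que asume el objeto `pt_outputs` calculado arriba con el modelo de PyTorch), las tres formas de acceso descritas devuelven el mismo tensor:

```py
>>> logits = pt_outputs.logits       # attribute access
>>> logits = pt_outputs["logits"]    # string key, same tensor
>>> logits = pt_outputs[0]           # integer index; attributes that are `None` are skipped
```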
- - - -### Guarda un modelo - - - -Una vez que se haya hecho fine-tuning a tu modelo puedes guardarlo con tu tokenizador usando [`PreTrainedModel.save_pretrained`]: - -```py ->>> pt_save_directory = "./pt_save_pretrained" ->>> tokenizer.save_pretrained(pt_save_directory) # doctest: +IGNORE_RESULT ->>> pt_model.save_pretrained(pt_save_directory) -``` - -Cuando quieras usar el modelo otra vez cárgalo con [`PreTrainedModel.from_pretrained`]: - -```py ->>> pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained") -``` - - - - -Una vez que se haya hecho fine-tuning a tu modelo puedes guardarlo con tu tokenizador usando [`TFPreTrainedModel.save_pretrained`]: - -```py ->>> tf_save_directory = "./tf_save_pretrained" ->>> tokenizer.save_pretrained(tf_save_directory) # doctest: +IGNORE_RESULT ->>> tf_model.save_pretrained(tf_save_directory) -``` - -Cuando quieras usar el modelo otra vez cárgalo con [`TFPreTrainedModel.from_pretrained`]: - -```py ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained") -``` - - - -Una característica particularmente interesante de 🤗 Transformers es la habilidad de guardar el modelo y cargarlo como un modelo de PyTorch o TensorFlow. El parámetro `from_pt` o `from_tf` puede convertir el modelo de un framework al otro: - - - -```py ->>> from transformers import AutoModel - ->>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) -``` - - -```py ->>> from transformers import TFAutoModel - ->>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) -``` - - diff --git a/docs/source/es/run_scripts.md b/docs/source/es/run_scripts.md new file mode 100644 index 000000000000..a66fd1e47e13 --- /dev/null +++ b/docs/source/es/run_scripts.md @@ -0,0 +1,351 @@ + + +# Entrenamiento con scripts + +Junto con los [notebooks](./noteboks/README) de 🤗 Transformers, también hay scripts con ejemplos que muestran cómo entrenar un modelo para una tarea en [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), o [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). + +También encontrarás scripts que hemos usado en nuestros [proyectos de investigación](https://github.com/huggingface/transformers/tree/main/examples/research_projects) y [ejemplos pasados](https://github.com/huggingface/transformers/tree/main/examples/legacy) que en su mayoría son aportados por la comunidad. Estos scripts no se mantienen activamente y requieren una versión específica de 🤗 Transformers que probablemente sea incompatible con la última versión de la biblioteca. + +No se espera que los scripts de ejemplo funcionen de inmediato en todos los problemas, y es posible que debas adaptar el script al problema que estás tratando de resolver. Para ayudarte con esto, la mayoría de los scripts exponen completamente cómo se preprocesan los datos, lo que te permite editarlos según sea necesario para tu caso de uso. + +Para cualquier característica que te gustaría implementar en un script de ejemplo, por favor discútelo en el [foro](https://discuss.huggingface.co/) o con un [issue](https://github.com/huggingface/transformers/issues) antes de enviar un Pull Request. 
Si bien agradecemos las correcciones de errores, es poco probable que fusionemos un Pull Request que agregue más funcionalidad a costa de la legibilidad. + +Esta guía te mostrará cómo ejecutar un ejemplo de un script de entrenamiento para resumir texto en [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) y [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization). Se espera que todos los ejemplos funcionen con ambos frameworks a menos que se especifique lo contrario. + +## Configuración + +Para ejecutar con éxito la última versión de los scripts de ejemplo debes **instalar 🤗 Transformers desde su fuente** en un nuevo entorno virtual: + +```bash +git clone https://github.com/huggingface/transformers +cd transformers +pip install . +``` + +Para versiones anteriores de los scripts de ejemplo, haz clic en alguno de los siguientes links: + +
+ Ejemplos de versiones anteriores de 🤗 Transformers + +
+ +Luego cambia tu clon actual de 🤗 Transformers a una versión específica, por ejemplo v3.5.1: + +```bash +git checkout tags/v3.5.1 +``` + +Una vez que hayas configurado la versión correcta de la biblioteca, ve a la carpeta de ejemplo de tu elección e instala los requisitos específicos del ejemplo: + +```bash +pip install -r requirements.txt +``` + +## Ejecutar un script + + + +El script de ejemplo descarga y preprocesa un conjunto de datos de la biblioteca 🤗 [Datasets](https://huggingface.co/docs/datasets/). Luego, el script ajusta un conjunto de datos con [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) en una arquitectura que soporta la tarea de resumen. El siguiente ejemplo muestra cómo ajustar un [T5-small](https://huggingface.co/t5-small) en el conjunto de datos [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). El modelo T5 requiere un argumento adicional `source_prefix` debido a cómo fue entrenado. Este aviso le permite a T5 saber que se trata de una tarea de resumir. + +```bash +python examples/pytorch/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + + +El script de ejemplo descarga y preprocesa un conjunto de datos de la biblioteca 🤗 [Datasets](https://huggingface.co/docs/datasets/). Luego, el script ajusta un conjunto de datos utilizando Keras en una arquitectura que soporta la tarea de resumir. El siguiente ejemplo muestra cómo ajustar un [T5-small](https://huggingface.co/t5-small) en el conjunto de datos [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). El modelo T5 requiere un argumento adicional `source_prefix` debido a cómo fue entrenado. Este aviso le permite a T5 saber que se trata de una tarea de resumir. + +```bash +python examples/tensorflow/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 16 \ + --num_train_epochs 3 \ + --do_train \ + --do_eval +``` + + + +## Entrenamiento distribuido y de precisión mixta + +[Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) admite un entrenamiento distribuido y de precisión mixta, lo que significa que también puedes usarlo en un script. Para habilitar ambas características: + +- Agrega el argumento `fp16` para habilitar la precisión mixta. +- Establece la cantidad de GPU que se usará con el argumento `nproc_per_node`. + +```bash +python -m torch.distributed.launch \ + --nproc_per_node 8 pytorch/summarization/run_summarization.py \ + --fp16 \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + +Los scripts de TensorFlow utilizan [`MirroredStrategy`](https://www.tensorflow.org/guide/distributed_training#mirroredstrategy) para el entrenamiento distribuido, y no es necesario agregar argumentos adicionales al script de entrenamiento. 
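A modo de esquema ilustrativo (no es el código literal de los scripts de ejemplo, solo el patrón general de `tf.distribute.MirroredStrategy`, usando el checkpoint `t5-small` de esta guía), el modelo se crea dentro de `strategy.scope()` para que sus variables se repliquen en todas las GPUs detectadas:

```py
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM

# MirroredStrategy picks up all visible GPUs by default
strategy = tf.distribute.MirroredStrategy()
print(f"Replicas in sync: {strategy.num_replicas_in_sync}")

# Variables created inside the scope are mirrored across the replicas
with strategy.scope():
    model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")
    model.compile(optimizer="adam")  # the example scripts configure the optimizer with more care

# model.fit(...) then trains on all replicas without extra command-line arguments
```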
El script de TensorFlow utilizará múltiples GPUs de forma predeterminada si están disponibles. + +## Ejecutar un script en una TPU + + + +Las Unidades de Procesamiento de Tensor (TPUs) están diseñadas específicamente para acelerar el rendimiento. PyTorch admite TPU con el compilador de aprendizaje profundo [XLA](https://www.tensorflow.org/xla) (consulta [aquí](https://github.com/pytorch/xla/blob/master/README.md) para obtener más detalles). Para usar una TPU, inicia el script `xla_spawn.py` y usa el argumento `num_cores` para establecer la cantidad de núcleos de TPU que deseas usar. + +```bash +python xla_spawn.py --num_cores 8 \ + summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + + +Las Unidades de Procesamiento de Tensor (TPUs) están diseñadas específicamente para acelerar el rendimiento. TensorFlow utiliza [`TPUStrategy`](https://www.tensorflow.org/guide/distributed_training#tpustrategy) para entrenar en TPUs. Para usar una TPU, pasa el nombre del recurso de la TPU al argumento `tpu` + +```bash +python run_summarization.py \ + --tpu name_of_tpu_resource \ + --model_name_or_path t5-small \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 16 \ + --num_train_epochs 3 \ + --do_train \ + --do_eval +``` + + + +## Ejecutar un script con 🤗 Accelerate + +🤗 [Accelerate](https://huggingface.co/docs/accelerate) es una biblioteca exclusiva de PyTorch que ofrece un método unificado para entrenar un modelo en varios tipos de configuraciones (solo CPU, GPU múltiples, TPU) mientras mantiene una visibilidad completa en el ciclo de entrenamiento de PyTorch. Asegúrate de tener 🤗 Accelerate instalado si aún no lo tienes: + +> Nota: Como Accelerate se está desarrollando rápidamente, debes instalar la versión git de Accelerate para ejecutar los scripts +```bash +pip install git+https://github.com/huggingface/accelerate +``` + +En lugar del script `run_summarization.py`, debes usar el script `run_summarization_no_trainer.py`. Los scripts compatibles con 🤗 Accelerate tendrán un archivo `task_no_trainer.py` en la carpeta. Comienza ejecutando el siguiente comando para crear y guardar un archivo de configuración: + +```bash +accelerate config +``` + +Prueba tu configuración para asegurarte que está configurada correctamente: + +```bash +accelerate test +``` + +Todo listo para iniciar el entrenamiento: + +```bash +accelerate launch run_summarization_no_trainer.py \ + --model_name_or_path t5-small \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir ~/tmp/tst-summarization +``` + +## Usar un conjunto de datos personalizado + +El script de la tarea resumir admite conjuntos de datos personalizados siempre que sean un archivo CSV o JSON Line. Cuando uses tu propio conjunto de datos, necesitas especificar varios argumentos adicionales: + +- `train_file` y `validation_file` especifican la ruta a tus archivos de entrenamiento y validación. +- `text_column` es el texto de entrada para resumir. +- `summary_column` es el texto de destino para la salida. 
+ +Un script para resumir que utiliza un conjunto de datos personalizado se vera así: + +```bash +python examples/pytorch/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --train_file path_to_csv_or_jsonlines_file \ + --validation_file path_to_csv_or_jsonlines_file \ + --text_column text_column_name \ + --summary_column summary_column_name \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --overwrite_output_dir \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --predict_with_generate +``` + +## Prueba un script + +A veces, es una buena idea ejecutar tu secuencia de comandos en una cantidad menor de ejemplos para asegurarte de que todo funciona como se espera antes de comprometerte con un conjunto de datos completo, lo que puede demorar horas en completarse. Utiliza los siguientes argumentos para truncar el conjunto de datos a un número máximo de muestras: + +- `max_train_samples` +- `max_eval_samples` +- `max_predict_samples` + +```bash +python examples/pytorch/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --max_train_samples 50 \ + --max_eval_samples 50 \ + --max_predict_samples 50 \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + +No todos los scripts de ejemplo admiten el argumento `max_predict_samples`. Puede que desconozcas si la secuencia de comandos admite este argumento, agrega `-h` para verificar: + +```bash +examples/pytorch/summarization/run_summarization.py -h +``` + +## Reanudar el entrenamiento desde el punto de control + +Otra opción útil para habilitar es reanudar el entrenamiento desde un punto de control anterior. Esto asegurará que puedas continuar donde lo dejaste sin comenzar de nuevo si tu entrenamiento se interrumpe. Hay dos métodos para reanudar el entrenamiento desde un punto de control. + +El primer método utiliza el argumento `output_dir previous_output_dir` para reanudar el entrenamiento desde el último punto de control almacenado en `output_dir`. En este caso, debes eliminar `overwrite_output_dir`: + +```bash +python examples/pytorch/summarization/run_summarization.py + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --output_dir previous_output_dir \ + --predict_with_generate +``` + +El segundo método utiliza el argumento `resume_from_checkpoint path_to_specific_checkpoint` para reanudar el entrenamiento desde una carpeta de punto de control específica. + +```bash +python examples/pytorch/summarization/run_summarization.py + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --resume_from_checkpoint path_to_specific_checkpoint \ + --predict_with_generate +``` + +## Comparte tu modelo + +Todos los scripts pueden cargar tu modelo final en el [Model Hub](https://huggingface.co/models). 
Asegúrate de haber iniciado sesión en Hugging Face antes de comenzar: + +```bash +huggingface-cli login +``` + +Luego agrega el argumento `push_to_hub` al script. Este argumento creará un repositorio con tu nombre de usuario Hugging Face y el nombre de la carpeta especificado en `output_dir`. + +Para darle a tu repositorio un nombre específico, usa el argumento `push_to_hub_model_id` para añadirlo. El repositorio se incluirá automáticamente en tu namespace. + +El siguiente ejemplo muestra cómo cargar un modelo con un nombre de repositorio específico: + +```bash +python examples/pytorch/summarization/run_summarization.py + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --push_to_hub \ + --push_to_hub_model_id finetuned-t5-cnn_dailymail \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` diff --git a/docs/source/es/run_scripts.mdx b/docs/source/es/run_scripts.mdx deleted file mode 100644 index d0ab716f80ff..000000000000 --- a/docs/source/es/run_scripts.mdx +++ /dev/null @@ -1,347 +0,0 @@ - - -# Entrenamiento con scripts - -Junto con los [notebooks](./noteboks/README) de 🤗 Transformers, también hay scripts con ejemplos que muestran cómo entrenar un modelo para una tarea en [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), o [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). - -También encontrarás scripts que hemos usado en nuestros [proyectos de investigación](https://github.com/huggingface/transformers/tree/main/examples/research_projects) y [ejemplos pasados](https://github.com/huggingface/transformers/tree/main/examples/legacy) que en su mayoría son aportados por la comunidad. Estos scripts no se mantienen activamente y requieren una versión específica de 🤗 Transformers que probablemente sea incompatible con la última versión de la biblioteca. - -No se espera que los scripts de ejemplo funcionen de inmediato en todos los problemas, y es posible que debas adaptar el script al problema que estás tratando de resolver. Para ayudarte con esto, la mayoría de los scripts exponen completamente cómo se preprocesan los datos, lo que te permite editarlos según sea necesario para tu caso de uso. - -Para cualquier característica que te gustaría implementar en un script de ejemplo, por favor discútelo en el [foro](https://discuss.huggingface.co/) o con un [issue](https://github.com/huggingface/transformers/issues) antes de enviar un Pull Request. Si bien agradecemos las correcciones de errores, es poco probable que fusionemos un Pull Request que agregue más funcionalidad a costa de la legibilidad. - -Esta guía te mostrará cómo ejecutar un ejemplo de un script de entrenamiento para resumir texto en [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) y [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization). Se espera que todos los ejemplos funcionen con ambos frameworks a menos que se especifique lo contrario. 
- -## Configuración - -Para ejecutar con éxito la última versión de los scripts de ejemplo debes **instalar 🤗 Transformers desde su fuente** en un nuevo entorno virtual: - -```bash -git clone https://github.com/huggingface/transformers -cd transformers -pip install . -``` - -Para versiones anteriores de los scripts de ejemplo, haz clic en alguno de los siguientes links: - -
- Ejemplos de versiones anteriores de 🤗 Transformers - -
- -Luego cambia tu clon actual de 🤗 Transformers a una versión específica, por ejemplo v3.5.1: - -```bash -git checkout tags/v3.5.1 -``` - -Una vez que hayas configurado la versión correcta de la biblioteca, ve a la carpeta de ejemplo de tu elección e instala los requisitos específicos del ejemplo: - -```bash -pip install -r requirements.txt -``` - -## Ejecutar un script - - - -El script de ejemplo descarga y preprocesa un conjunto de datos de la biblioteca 🤗 [Datasets](https://huggingface.co/docs/datasets/). Luego, el script ajusta un conjunto de datos con [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) en una arquitectura que soporta la tarea de resumen. El siguiente ejemplo muestra cómo ajustar un [T5-small](https://huggingface.co/t5-small) en el conjunto de datos [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). El modelo T5 requiere un argumento adicional `source_prefix` debido a cómo fue entrenado. Este aviso le permite a T5 saber que se trata de una tarea de resumir. - -```bash -python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - - -El script de ejemplo descarga y preprocesa un conjunto de datos de la biblioteca 🤗 [Datasets](https://huggingface.co/docs/datasets/). Luego, el script ajusta un conjunto de datos utilizando Keras en una arquitectura que soporta la tarea de resumir. El siguiente ejemplo muestra cómo ajustar un [T5-small](https://huggingface.co/t5-small) en el conjunto de datos [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). El modelo T5 requiere un argumento adicional `source_prefix` debido a cómo fue entrenado. Este aviso le permite a T5 saber que se trata de una tarea de resumir. - -```bash -python examples/tensorflow/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 16 \ - --num_train_epochs 3 \ - --do_train \ - --do_eval -``` - - - -## Entrenamiento distribuido y de precisión mixta - -[Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) admite un entrenamiento distribuido y de precisión mixta, lo que significa que también puedes usarlo en un script. Para habilitar ambas características: - -- Agrega el argumento `fp16` para habilitar la precisión mixta. -- Establece la cantidad de GPU que se usará con el argumento `nproc_per_node`. - -```bash -python -m torch.distributed.launch \ - --nproc_per_node 8 pytorch/summarization/run_summarization.py \ - --fp16 \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - -Los scripts de TensorFlow utilizan [`MirroredStrategy`](https://www.tensorflow.org/guide/distributed_training#mirroredstrategy) para el entrenamiento distribuido, y no es necesario agregar argumentos adicionales al script de entrenamiento. 
El script de TensorFlow utilizará múltiples GPUs de forma predeterminada si están disponibles. - -## Ejecutar un script en una TPU - - - -Las Unidades de Procesamiento de Tensor (TPUs) están diseñadas específicamente para acelerar el rendimiento. PyTorch admite TPU con el compilador de aprendizaje profundo [XLA](https://www.tensorflow.org/xla) (consulta [aquí](https://github.com/pytorch/xla/blob/master/README.md) para obtener más detalles). Para usar una TPU, inicia el script `xla_spawn.py` y usa el argumento `num_cores` para establecer la cantidad de núcleos de TPU que deseas usar. - -```bash -python xla_spawn.py --num_cores 8 \ - summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - - -Las Unidades de Procesamiento de Tensor (TPUs) están diseñadas específicamente para acelerar el rendimiento. TensorFlow utiliza [`TPUStrategy`](https://www.tensorflow.org/guide/distributed_training#tpustrategy) para entrenar en TPUs. Para usar una TPU, pasa el nombre del recurso de la TPU al argumento `tpu` - -```bash -python run_summarization.py \ - --tpu name_of_tpu_resource \ - --model_name_or_path t5-small \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 16 \ - --num_train_epochs 3 \ - --do_train \ - --do_eval -``` - - - -## Ejecutar un script con 🤗 Accelerate - -🤗 [Accelerate](https://huggingface.co/docs/accelerate) es una biblioteca exclusiva de PyTorch que ofrece un método unificado para entrenar un modelo en varios tipos de configuraciones (solo CPU, GPU múltiples, TPU) mientras mantiene una visibilidad completa en el ciclo de entrenamiento de PyTorch. Asegúrate de tener 🤗 Accelerate instalado si aún no lo tienes: - -> Nota: Como Accelerate se está desarrollando rápidamente, debes instalar la versión git de Accelerate para ejecutar los scripts -```bash -pip install git+https://github.com/huggingface/accelerate -``` - -En lugar del script `run_summarization.py`, debes usar el script `run_summarization_no_trainer.py`. Los scripts compatibles con 🤗 Accelerate tendrán un archivo `task_no_trainer.py` en la carpeta. Comienza ejecutando el siguiente comando para crear y guardar un archivo de configuración: - -```bash -accelerate config -``` - -Prueba tu configuración para asegurarte que está configurada correctamente: - -```bash -accelerate test -``` - -Todo listo para iniciar el entrenamiento: - -```bash -accelerate launch run_summarization_no_trainer.py \ - --model_name_or_path t5-small \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir ~/tmp/tst-summarization -``` - -## Usar un conjunto de datos personalizado - -El script de la tarea resumir admite conjuntos de datos personalizados siempre que sean un archivo CSV o JSON Line. Cuando uses tu propio conjunto de datos, necesitas especificar varios argumentos adicionales: - -- `train_file` y `validation_file` especifican la ruta a tus archivos de entrenamiento y validación. -- `text_column` es el texto de entrada para resumir. -- `summary_column` es el texto de destino para la salida. 
- -Un script para resumir que utiliza un conjunto de datos personalizado se vera así: - -```bash -python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --train_file path_to_csv_or_jsonlines_file \ - --validation_file path_to_csv_or_jsonlines_file \ - --text_column text_column_name \ - --summary_column summary_column_name \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --overwrite_output_dir \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --predict_with_generate -``` - -## Prueba un script - -A veces, es una buena idea ejecutar tu secuencia de comandos en una cantidad menor de ejemplos para asegurarte de que todo funciona como se espera antes de comprometerte con un conjunto de datos completo, lo que puede demorar horas en completarse. Utiliza los siguientes argumentos para truncar el conjunto de datos a un número máximo de muestras: - -- `max_train_samples` -- `max_eval_samples` -- `max_predict_samples` - -```bash -python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --max_train_samples 50 \ - --max_eval_samples 50 \ - --max_predict_samples 50 \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - -No todos los scripts de ejemplo admiten el argumento `max_predict_samples`. Puede que desconozcas si la secuencia de comandos admite este argumento, agrega `-h` para verificar: - -```bash -examples/pytorch/summarization/run_summarization.py -h -``` - -## Reanudar el entrenamiento desde el punto de control - -Otra opción útil para habilitar es reanudar el entrenamiento desde un punto de control anterior. Esto asegurará que puedas continuar donde lo dejaste sin comenzar de nuevo si tu entrenamiento se interrumpe. Hay dos métodos para reanudar el entrenamiento desde un punto de control. - -El primer método utiliza el argumento `output_dir previous_output_dir` para reanudar el entrenamiento desde el último punto de control almacenado en `output_dir`. En este caso, debes eliminar `overwrite_output_dir`: - -```bash -python examples/pytorch/summarization/run_summarization.py - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --output_dir previous_output_dir \ - --predict_with_generate -``` - -El segundo método utiliza el argumento `resume_from_checkpoint path_to_specific_checkpoint` para reanudar el entrenamiento desde una carpeta de punto de control específica. - -```bash -python examples/pytorch/summarization/run_summarization.py - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --resume_from_checkpoint path_to_specific_checkpoint \ - --predict_with_generate -``` - -## Comparte tu modelo - -Todos los scripts pueden cargar tu modelo final en el [Model Hub](https://huggingface.co/models). 
Asegúrate de haber iniciado sesión en Hugging Face antes de comenzar: - -```bash -huggingface-cli login -``` - -Luego agrega el argumento `push_to_hub` al script. Este argumento creará un repositorio con tu nombre de usuario Hugging Face y el nombre de la carpeta especificado en `output_dir`. - -Para darle a tu repositorio un nombre específico, usa el argumento `push_to_hub_model_id` para añadirlo. El repositorio se incluirá automáticamente en tu namespace. - -El siguiente ejemplo muestra cómo cargar un modelo con un nombre de repositorio específico: - -```bash -python examples/pytorch/summarization/run_summarization.py - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --push_to_hub \ - --push_to_hub_model_id finetuned-t5-cnn_dailymail \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` diff --git a/docs/source/es/sagemaker.md b/docs/source/es/sagemaker.md new file mode 100644 index 000000000000..a874aefe76f6 --- /dev/null +++ b/docs/source/es/sagemaker.md @@ -0,0 +1,29 @@ + + +# Ejecutar el entrenamiento en Amazon SageMaker + +La documentación ha sido trasladada a [hf.co/docs/sagemaker](https://huggingface.co/docs/sagemaker). Esta página será eliminada en `transformers` 5.0. + +### Tabla de contenido + +- [Entrenar modelos de Hugging Face en Amazon SageMaker con SageMaker Python SDK](https://huggingface.co/docs/sagemaker/train) +- [Desplegar modelos de Hugging Face en Amazon SageMaker con SageMaker Python SDK](https://huggingface.co/docs/sagemaker/inference) +- [Preguntas Frecuentes](https://huggingface.co/docs/sagemaker/faq) diff --git a/docs/source/es/sagemaker.mdx b/docs/source/es/sagemaker.mdx deleted file mode 100644 index 491d93e10d4d..000000000000 --- a/docs/source/es/sagemaker.mdx +++ /dev/null @@ -1,25 +0,0 @@ - - -# Ejecutar el entrenamiento en Amazon SageMaker - -La documentación ha sido trasladada a [hf.co/docs/sagemaker](https://huggingface.co/docs/sagemaker). Esta página será eliminada en `transformers` 5.0. - -### Tabla de contenido - -- [Entrenar modelos de Hugging Face en Amazon SageMaker con SageMaker Python SDK](https://huggingface.co/docs/sagemaker/train) -- [Desplegar modelos de Hugging Face en Amazon SageMaker con SageMaker Python SDK](https://huggingface.co/docs/sagemaker/inference) -- [Preguntas Frecuentes](https://huggingface.co/docs/sagemaker/faq) diff --git a/docs/source/es/serialization.md b/docs/source/es/serialization.md new file mode 100644 index 000000000000..9c24ba72f3d4 --- /dev/null +++ b/docs/source/es/serialization.md @@ -0,0 +1,674 @@ + + +# Exportar modelos 🤗 Transformers + +Si necesitas implementar modelos 🤗 Transformers en entornos de producción, te +recomendamos exportarlos a un formato serializado que se pueda cargar y ejecutar +en tiempos de ejecución y hardware especializados. En esta guía, te mostraremos cómo +exportar modelos 🤗 Transformers en dos formatos ampliamente utilizados: ONNX y TorchScript. + +Una vez exportado, un modelo puede optimizarse para la inferencia a través de técnicas +como la cuantización y _pruning_. Si estás interesado en optimizar tus modelos para +que funcionen con la máxima eficiencia, consulta la +[biblioteca de 🤗 Optimum](https://github.com/huggingface/optimum). 
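Solo como esbozo de lo que significa "cuantización" en este contexto (aquí se usan directamente las utilidades de ONNX Runtime en lugar de 🤗 Optimum, se requiere `onnxruntime` instalado y se asume el archivo `onnx/model.onnx` que se exporta más adelante en esta guía):

```python
from onnxruntime.quantization import QuantType, quantize_dynamic

# Quantize the exported weights to int8 to shrink the file and speed up CPU inference
quantize_dynamic(
    model_input="onnx/model.onnx",        # graph produced by `python -m transformers.onnx ...`
    model_output="onnx/model_quant.onnx",
    weight_type=QuantType.QInt8,
)
```

El modelo cuantizado resultante se carga con `InferenceSession` exactamente igual que el original.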
+ +## ONNX + +El proyecto [ONNX (Open Neural Network eXchange)](http://onnx.ai) es un +estándar abierto que define un conjunto común de operadores y un formato +de archivo común para representar modelos de aprendizaje profundo en una +amplia variedad de _frameworks_, incluidos PyTorch y TensorFlow. Cuando un modelo +se exporta al formato ONNX, estos operadores se usan para construir un +grafo computacional (a menudo llamado _representación intermedia_) que +representa el flujo de datos a través de la red neuronal. + +Al exponer un grafo con operadores y tipos de datos estandarizados, ONNX facilita +el cambio entre frameworks. Por ejemplo, un modelo entrenado en PyTorch se puede +exportar a formato ONNX y luego importar en TensorFlow (y viceversa). + +🤗 Transformers proporciona un paquete llamado `transformers.onnx`, el cual permite convertir +los checkpoints de un modelo en un grafo ONNX aprovechando los objetos de configuración. +Estos objetos de configuración están hechos a la medida de diferentes arquitecturas de modelos +y están diseñados para ser fácilmente extensibles a otras arquitecturas. + +Las configuraciones a la medida incluyen las siguientes arquitecturas: + + + +- ALBERT +- BART +- BEiT +- BERT +- BigBird +- BigBird-Pegasus +- Blenderbot +- BlenderbotSmall +- BLOOM +- CamemBERT +- CLIP +- CodeGen +- ConvBERT +- ConvNeXT +- ConvNeXTV2 +- Data2VecText +- Data2VecVision +- DeBERTa +- DeBERTa-v2 +- DeiT +- DETR +- DistilBERT +- ELECTRA +- FlauBERT +- GPT Neo +- GPT-J +- I-BERT +- LayoutLM +- LayoutLMv3 +- LeViT +- LongT5 +- M2M100 +- Marian +- mBART +- MobileBERT +- MobileViT +- MT5 +- OpenAI GPT-2 +- Perceiver +- PLBart +- ResNet +- RoBERTa +- RoFormer +- SqueezeBERT +- T5 +- ViT +- XLM +- XLM-RoBERTa +- XLM-RoBERTa-XL +- YOLOS + +En las próximas dos secciones, te mostraremos cómo: + +* Exportar un modelo compatible utilizando el paquete `transformers.onnx`. +* Exportar un modelo personalizado para una arquitectura no compatible. + +### Exportar un model a ONNX + +Para exportar un modelo 🤗 Transformers a ONNX, tienes que instalar primero algunas +dependencias extra: + +```bash +pip install transformers[onnx] +``` + +El paquete `transformers.onnx` puede ser usado luego como un módulo de Python: + +```bash +python -m transformers.onnx --help + +usage: Hugging Face Transformers ONNX exporter [-h] -m MODEL [--feature {causal-lm, ...}] [--opset OPSET] [--atol ATOL] output + +positional arguments: + output Path indicating where to store generated ONNX model. + +optional arguments: + -h, --help show this help message and exit + -m MODEL, --model MODEL + Model ID on huggingface.co or path on disk to load model from. + --feature {causal-lm, ...} + The type of features to export the model with. + --opset OPSET ONNX opset version to export the model with. + --atol ATOL Absolute difference tolerence when validating the model. +``` + +Exportar un checkpoint usando una configuración a la medida se puede hacer de la siguiente manera: + +```bash +python -m transformers.onnx --model=distilbert-base-uncased onnx/ +``` + +que debería mostrar los siguientes registros: + +```bash +Validating ONNX model... + -[✓] ONNX model output names match reference model ({'last_hidden_state'}) + - Validating ONNX Model output "last_hidden_state": + -[✓] (2, 8, 768) matches (2, 8, 768) + -[✓] all values close (atol: 1e-05) +All good, model saved at: onnx/model.onnx +``` + +Esto exporta un grafo ONNX del checkpoint definido por el argumento `--model`. 
+En este ejemplo, es un modelo `distilbert-base-uncased`, pero puede ser cualquier +checkpoint en Hugging Face Hub o que esté almacenado localmente. + +El archivo `model.onnx` resultante se puede ejecutar en uno de los +[muchos aceleradores](https://onnx.ai/supported-tools.html#deployModel) +que admiten el estándar ONNX. Por ejemplo, podemos cargar y ejecutar el +modelo con [ONNX Runtime](https://onnxruntime.ai/) de la siguiente manera: + +```python +>>> from transformers import AutoTokenizer +>>> from onnxruntime import InferenceSession + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") +>>> session = InferenceSession("onnx/model.onnx") +>>> # ONNX Runtime expects NumPy arrays as input +>>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np") +>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs)) +``` + +Los nombres necesarios de salida (es decir, `["last_hidden_state"]`) se pueden obtener +echando un vistazo a la configuración ONNX de cada modelo. Por ejemplo, para DistilBERT tenemos: + +```python +>>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig + +>>> config = DistilBertConfig() +>>> onnx_config = DistilBertOnnxConfig(config) +>>> print(list(onnx_config.outputs.keys())) +["last_hidden_state"]s +``` + +El proceso es idéntico para los checkpoints de TensorFlow en Hub. +Por ejemplo, podemos exportar un checkpoint puro de TensorFlow desde +[Keras](https://huggingface.co/keras-io) de la siguiente manera: + +```bash +python -m transformers.onnx --model=keras-io/transformers-qa onnx/ +``` + +Para exportar un modelo que está almacenado localmente, deberás tener los pesos +y tokenizadores del modelo almacenados en un directorio. Por ejemplo, podemos cargar +y guardar un checkpoint de la siguiente manera: + + + +```python +>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification + +>>> # Load tokenizer and PyTorch weights form the Hub +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") +>>> pt_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +>>> # Save to disk +>>> tokenizer.save_pretrained("local-pt-checkpoint") +>>> pt_model.save_pretrained("local-pt-checkpoint") +``` + +Una vez que se guarda el checkpoint, podemos exportarlo a ONNX usando el argumento `--model` +del paquete `transformers.onnx` al directorio deseado: + +```bash +python -m transformers.onnx --model=local-pt-checkpoint onnx/ +``` + + +```python +>>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification + +>>> # Load tokenizer and TensorFlow weights from the Hub +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +>>> # Save to disk +>>> tokenizer.save_pretrained("local-tf-checkpoint") +>>> tf_model.save_pretrained("local-tf-checkpoint") +``` + +Una vez que se guarda el checkpoint, podemos exportarlo a ONNX usando el argumento `--model` +del paquete `transformers.onnx` al directorio deseado: + +```bash +python -m transformers.onnx --model=local-tf-checkpoint onnx/ +``` + + + +### Seleccionar características para diferentes topologías de un modelo + +Cada configuración a la medida viene con un conjunto de _características_ que te permiten exportar +modelos para diferentes tipos de topologías o tareas. 
Como se muestra en la siguiente tabla, cada +función está asociada con una auto-clase de automóvil diferente: + +| Feature | Auto Class | +| ------------------------------------ | ------------------------------------ | +| `causal-lm`, `causal-lm-with-past` | `AutoModelForCausalLM` | +| `default`, `default-with-past` | `AutoModel` | +| `masked-lm` | `AutoModelForMaskedLM` | +| `question-answering` | `AutoModelForQuestionAnswering` | +| `seq2seq-lm`, `seq2seq-lm-with-past` | `AutoModelForSeq2SeqLM` | +| `sequence-classification` | `AutoModelForSequenceClassification` | +| `token-classification` | `AutoModelForTokenClassification` | + +Para cada configuración, puedes encontrar la lista de funciones admitidas a través de `FeaturesManager`. +Por ejemplo, para DistilBERT tenemos: + +```python +>>> from transformers.onnx.features import FeaturesManager + +>>> distilbert_features = list(FeaturesManager.get_supported_features_for_model_type("distilbert").keys()) +>>> print(distilbert_features) +["default", "masked-lm", "causal-lm", "sequence-classification", "token-classification", "question-answering"] +``` + +Le puedes pasar una de estas características al argumento `--feature` en el paquete `transformers.onnx`. +Por ejemplo, para exportar un modelo de clasificación de texto, podemos elegir un modelo ya ajustado del Hub y ejecutar: + +```bash +python -m transformers.onnx --model=distilbert-base-uncased-finetuned-sst-2-english \ + --feature=sequence-classification onnx/ +``` + +que mostrará los siguientes registros: + +```bash +Validating ONNX model... + -[✓] ONNX model output names match reference model ({'logits'}) + - Validating ONNX Model output "logits": + -[✓] (2, 2) matches (2, 2) + -[✓] all values close (atol: 1e-05) +All good, model saved at: onnx/model.onnx +``` + +Ten en cuenta que, en este caso, los nombres de salida del modelo ajustado son `logits` en lugar de `last_hidden_state` +que vimos anteriormente con el checkpoint `distilbert-base-uncased`. Esto es de esperarse ya que el modelo ajustado +tiene un cabezal de clasificación secuencial. + + + +Las características que tienen un sufijo 'with-past' (por ejemplo, 'causal-lm-with-past') corresponden a topologías +de modelo con estados ocultos precalculados (clave y valores en los bloques de atención) que se pueden usar para una +decodificación autorregresiva más rápida. + + + + +### Exportar un modelo para una arquitectura no compatible + +Si deseas exportar un modelo cuya arquitectura no es compatible de forma nativa +con la biblioteca, debes seguir tres pasos principales: + +1. Implementa una configuración personalizada en ONNX. +2. Exporta el modelo a ONNX. +3. Valide los resultados de PyTorch y los modelos exportados. + +En esta sección, veremos cómo se implementó la serialización de DistilBERT +para mostrar lo que implica cada paso. + +#### Implementar una configuración personalizada en ONNX + +Comencemos con el objeto de configuración de ONNX. Proporcionamos tres clases abstractas +de las que debe heredar, según el tipo de arquitectura del modelo que quieras exportar: + +* Modelos basados en el _Encoder_ inherente de [`~onnx.config.OnnxConfig`] +* Modelos basados en el _Decoder_ inherente de [`~onnx.config.OnnxConfigWithPast`] +* Modelos _Encoder-decoder_ inherente de [`~onnx.config.OnnxSeq2SeqConfigWithPast`] + + + +Una buena manera de implementar una configuración personalizada en ONNX es observar la implementación +existente en el archivo `configuration_.py` de una arquitectura similar. 
+ + + +Dado que DistilBERT es un modelo de tipo _encoder_, su configuración se hereda de `OnnxConfig`: + +```python +>>> from typing import Mapping, OrderedDict +>>> from transformers.onnx import OnnxConfig + + +>>> class DistilBertOnnxConfig(OnnxConfig): +... @property +... def inputs(self) -> Mapping[str, Mapping[int, str]]: +... return OrderedDict( +... [ +... ("input_ids", {0: "batch", 1: "sequence"}), +... ("attention_mask", {0: "batch", 1: "sequence"}), +... ] +... ) +``` + +Cada objeto de configuración debe implementar la propiedad `inputs` y devolver un mapeo, +donde cada llave corresponde a una entrada esperada y cada valor indica el eje de esa entrada. +Para DistilBERT, podemos ver que se requieren dos entradas: `input_ids` y `attention_mask`. +Estas entradas tienen la misma forma de `(batch_size, sequence_length)`, es por lo que vemos +los mismos ejes utilizados en la configuración. + + + +Observa que la propiedad `inputs` para `DistilBertOnnxConfig` devuelve un `OrderedDict`. +Esto nos asegura que las entradas coincidan con su posición relativa dentro del método +`PreTrainedModel.forward()` al rastrear el grafo. Recomendamos usar un `OrderedDict` +para las propiedades `inputs` y `outputs` al implementar configuraciones ONNX personalizadas. + + + +Una vez que hayas implementado una configuración ONNX, puedes crear una +instancia proporcionando la configuración del modelo base de la siguiente manera: + +```python +>>> from transformers import AutoConfig + +>>> config = AutoConfig.from_pretrained("distilbert-base-uncased") +>>> onnx_config = DistilBertOnnxConfig(config) +``` + +El objeto resultante tiene varias propiedades útiles. Por ejemplo, puedes ver el conjunto de operadores ONNX que se +utilizará durante la exportación: + +```python +>>> print(onnx_config.default_onnx_opset) +11 +``` + +También puedes ver los resultados asociados con el modelo de la siguiente manera: + +```python +>>> print(onnx_config.outputs) +OrderedDict([("last_hidden_state", {0: "batch", 1: "sequence"})]) +``` + +Observa que la propiedad de salidas sigue la misma estructura que las entradas; +devuelve un objecto `OrderedDict` de salidas nombradas y sus formas. La estructura +de salida está vinculada a la elección de la función con la que se inicializa la configuración. +Por defecto, la configuración de ONNX se inicializa con la función `default` que +corresponde a exportar un modelo cargado con la clase `AutoModel`. Si quieres exportar +una topología de modelo diferente, simplemente proporciona una característica diferente +al argumento `task` cuando inicialices la configuración de ONNX. Por ejemplo, si quisiéramos +exportar DistilBERT con un cabezal de clasificación de secuencias, podríamos usar: + +```python +>>> from transformers import AutoConfig + +>>> config = AutoConfig.from_pretrained("distilbert-base-uncased") +>>> onnx_config_for_seq_clf = DistilBertOnnxConfig(config, task="sequence-classification") +>>> print(onnx_config_for_seq_clf.outputs) +OrderedDict([('logits', {0: 'batch'})]) +``` + + + +Todas las propiedades base y métodos asociados con [`~onnx.config.OnnxConfig`] y las +otras clases de configuración se pueden sobreescribir si es necesario. +Consulte [`BartOnnxConfig`] para ver un ejemplo avanzado. + + + +#### Exportar el modelo + +Una vez que hayas implementado la configuración de ONNX, el siguiente paso es exportar el modelo. +Aquí podemos usar la función `export()` proporcionada por el paquete `transformers.onnx`. 
+Esta función espera la configuración de ONNX, junto con el modelo base y el tokenizador, +y la ruta para guardar el archivo exportado: + +```python +>>> from pathlib import Path +>>> from transformers.onnx import export +>>> from transformers import AutoTokenizer, AutoModel + +>>> onnx_path = Path("model.onnx") +>>> model_ckpt = "distilbert-base-uncased" +>>> base_model = AutoModel.from_pretrained(model_ckpt) +>>> tokenizer = AutoTokenizer.from_pretrained(model_ckpt) + +>>> onnx_inputs, onnx_outputs = export(tokenizer, base_model, onnx_config, onnx_config.default_onnx_opset, onnx_path) +``` + +Los objetos `onnx_inputs` y `onnx_outputs` devueltos por la función `export()` +son listas de llaves definidas en las propiedades `inputs` y `outputs` de la configuración. +Una vez exportado el modelo, puedes probar que el modelo está bien formado de la siguiente manera: + +```python +>>> import onnx + +>>> onnx_model = onnx.load("model.onnx") +>>> onnx.checker.check_model(onnx_model) +``` + + + +Si tu modelo tiene más de 2GB, verás que se crean muchos archivos adicionales durante la exportación. +Esto es _esperado_ porque ONNX usa [Búferes de protocolo](https://developers.google.com/protocol-buffers/) +para almacenar el modelo y éstos tienen un límite de tamaño de 2 GB. Consulta la +[documentación de ONNX](https://github.com/onnx/onnx/blob/master/docs/ExternalData.md) para obtener +instrucciones sobre cómo cargar modelos con datos externos. + + + +#### Validar los resultados del modelo + +El paso final es validar que los resultados del modelo base y exportado coincidan dentro +de cierta tolerancia absoluta. Aquí podemos usar la función `validate_model_outputs()` +proporcionada por el paquete `transformers.onnx` de la siguiente manera: + +```python +>>> from transformers.onnx import validate_model_outputs + +>>> validate_model_outputs( +... onnx_config, tokenizer, base_model, onnx_path, onnx_outputs, onnx_config.atol_for_validation +... ) +``` + +Esta función usa el método `OnnxConfig.generate_dummy_inputs()` para generar entradas para el modelo base +y exportado, y la tolerancia absoluta se puede definir en la configuración. En general, encontramos una +concordancia numérica en el rango de 1e-6 a 1e-4, aunque es probable que cualquier valor menor que 1e-3 esté bien. + +### Contribuir con una nueva configuración a 🤗 Transformers + +¡Estamos buscando expandir el conjunto de configuraciones a la medida para usar y agradecemos las contribuciones de la comunidad! +Si deseas contribuir con su colaboración a la biblioteca, deberás: + +* Implementa la configuración de ONNX en el archivo `configuration_.py` correspondiente +* Incluye la arquitectura del modelo y las características correspondientes en [`~onnx.features.FeatureManager`] +* Agrega tu arquitectura de modelo a las pruebas en `test_onnx_v2.py` + +Revisa cómo fue la contribución para la [configuración de IBERT](https://github.com/huggingface/transformers/pull/14868/files) +y así tener una idea de lo que necesito. + +## TorchScript + + + +Este es el comienzo de nuestros experimentos con TorchScript y todavía estamos explorando sus capacidades con modelos de +tamaño de entrada variable. Es un tema de interés y profundizaremos nuestro análisis en las próximas +versiones, con más ejemplos de código, una implementación más flexible y puntos de referencia que comparen códigos +basados en Python con TorchScript compilado. 
+ + + +Según la documentación de PyTorch: "TorchScript es una forma de crear modelos serializables y optimizables a partir del +código de PyTorch". Los dos módulos de Pytorch [JIT y TRACE](https://pytorch.org/docs/stable/jit.html) permiten al +desarrollador exportar su modelo para reutilizarlo en otros programas, como los programas C++ orientados a la eficiencia. + +Hemos proporcionado una interfaz que permite exportar modelos de 🤗 Transformers a TorchScript para que puedan reutilizarse +en un entorno diferente al de un programa Python basado en PyTorch. Aquí explicamos cómo exportar y usar nuestros modelos +usando TorchScript. + +Exportar un modelo requiere de dos cosas: + +- un pase hacia adelante con entradas ficticias. +- instanciación del modelo con la indicador `torchscript`. + +Estas necesidades implican varias cosas con las que los desarrolladores deben tener cuidado. Éstas se detallan a continuación. + +### Indicador de TorchScript y pesos atados + +Este indicador es necesario porque la mayoría de los modelos de lenguaje en este repositorio tienen pesos vinculados entre su capa +de `Embedding` y su capa de `Decoding`. TorchScript no permite la exportación de modelos que tengan pesos atados, por lo que es +necesario desvincular y clonar los pesos previamente. + +Esto implica que los modelos instanciados con el indicador `torchscript` tienen su capa `Embedding` y `Decoding` separadas, +lo que significa que no deben entrenarse más adelante. El entrenamiento desincronizaría las dos capas, lo que generaría +resultados inesperados. + +Este no es el caso de los modelos que no tienen un cabezal de modelo de lenguaje, ya que no tienen pesos atados. +Estos modelos se pueden exportar de forma segura sin el indicador `torchscript`. + +### Entradas ficticias y longitudes estándar + +Las entradas ficticias se utilizan para crear un modelo de pase hacia adelante. Mientras los valores de las entradas se +propagan a través de las capas, PyTorch realiza un seguimiento de las diferentes operaciones ejecutadas en cada tensor. +Estas operaciones registradas se utilizan luego para crear el "rastro" del modelo. + +El rastro se crea en relación con las dimensiones de las entradas. Por lo tanto, está limitado por las dimensiones de la +entrada ficticia y no funcionará para ninguna otra longitud de secuencia o tamaño de lote. Al intentar con un tamaño diferente, +un error como: + +`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2` + +aparecerá. Por lo tanto, se recomienda rastrear el modelo con un tamaño de entrada ficticia al menos tan grande como la +entrada más grande que se alimentará al modelo durante la inferencia. El _padding_ se puede realizar para completar los +valores que faltan. Sin embargo, como el modelo se habrá rastreado con un tamaño de entrada grande, las dimensiones de +las diferentes matrices también serán grandes, lo que dará como resultado más cálculos. + +Se recomienda tener cuidado con el número total de operaciones realizadas en cada entrada y seguir de cerca el rendimiento +al exportar modelos de longitud de secuencia variable. + +### Usar TorchScript en Python + +A continuación se muestra un ejemplo que muestra cómo guardar, cargar modelos y cómo usar el rastreo para la inferencia. + +#### Guardando un modelo + +Este fragmento muestra cómo usar TorchScript para exportar un `BertModel`. 
Aquí, el `BertModel` se instancia de acuerdo +con la clase `BertConfig` y luego se guarda en el disco con el nombre de archivo `traced_bert.pt` + +```python +from transformers import BertModel, BertTokenizer, BertConfig +import torch + +enc = BertTokenizer.from_pretrained("bert-base-uncased") + +# Tokenizing input text +text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" +tokenized_text = enc.tokenize(text) + +# Masking one of the input tokens +masked_index = 8 +tokenized_text[masked_index] = "[MASK]" +indexed_tokens = enc.convert_tokens_to_ids(tokenized_text) +segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1] + +# Creating a dummy input +tokens_tensor = torch.tensor([indexed_tokens]) +segments_tensors = torch.tensor([segments_ids]) +dummy_input = [tokens_tensor, segments_tensors] + +# Initializing the model with the torchscript flag +# Flag set to True even though it is not necessary as this model does not have an LM Head. +config = BertConfig( + vocab_size_or_config_json_file=32000, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + torchscript=True, +) + +# Instantiating the model +model = BertModel(config) + +# The model needs to be in evaluation mode +model.eval() + +# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag +model = BertModel.from_pretrained("bert-base-uncased", torchscript=True) + +# Creating the trace +traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors]) +torch.jit.save(traced_model, "traced_bert.pt") +``` + +#### Cargar un modelo + +Este fragmento muestra cómo cargar el `BertModel` que se guardó previamente en el disco con el nombre `traced_bert.pt`. +Estamos reutilizando el `dummy_input` previamente inicializado. + +```python +loaded_model = torch.jit.load("traced_bert.pt") +loaded_model.eval() + +all_encoder_layers, pooled_output = loaded_model(*dummy_input) +``` + +#### Usar un modelo rastreado para la inferencia + +Usar el modelo rastreado para la inferencia es tan simple como usar su método `__call__`: + +```python +traced_model(tokens_tensor, segments_tensors) +``` + +### Implementar los modelos HuggingFace TorchScript en AWS mediante Neuron SDK + +AWS presentó la familia de instancias [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) para la inferencia +de aprendizaje automático de bajo costo y alto rendimiento en la nube. Las instancias Inf1 funcionan con el chip AWS +Inferentia, un acelerador de hardware personalizado, que se especializa en cargas de trabajo de inferencia de aprendizaje +profundo. [AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) es el kit de desarrollo para Inferentia +que admite el rastreo y la optimización de modelos de transformers para su implementación en Inf1. El SDK de Neuron proporciona: + + +1. API fácil de usar con una línea de cambio de código para rastrear y optimizar un modelo de TorchScript para la inferencia en la nube. +2. Optimizaciones de rendimiento listas para usar con un [costo-rendimiento mejorado](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/>) +3. Soporte para modelos HuggingFace Transformers construidos con [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) +o [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html). 
+ +#### Implicaciones + +Los modelos Transformers basados en la arquitectura +[BERT (Representaciones de _Enconder_ bidireccional de Transformers)](https://huggingface.co/docs/transformers/main/model_doc/bert), +o sus variantes, como [distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) y +[roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta), se ejecutarán mejor en Inf1 para tareas no +generativas, como la respuesta extractiva de preguntas, la clasificación de secuencias y la clasificación de tokens. +Como alternativa, las tareas de generación de texto se pueden adaptar para ejecutarse en Inf1, según este +[tutorial de AWS Neuron MarianMT](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html). +Puedes encontrar más información sobre los modelos que están listos para usarse en Inferentia en la +[sección _Model Architecture Fit_ de la documentación de Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia). + +#### Dependencias + +Usar AWS Neuron para convertir modelos requiere las siguientes dependencias y entornos: + +* Un [entorno Neuron SDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide), +que viene preconfigurado en [AWS Deep Learning AMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html). + +#### Convertir un modelo a AWS Neuron + +Con el mismo script usado en [Uso de TorchScript en Python](https://huggingface.co/docs/transformers/main/es/serialization#using-torchscript-in-python) +para rastrear un "BertModel", puedes importar la extensión del _framework_ `torch.neuron` para acceder a los componentes +del SDK de Neuron a través de una API de Python. + +```python +from transformers import BertModel, BertTokenizer, BertConfig +import torch +import torch.neuron +``` +Y modificando la línea de código de rastreo de: + +```python +torch.jit.trace(model, [tokens_tensor, segments_tensors]) +``` + +con lo siguiente: + +```python +torch.neuron.trace(model, [token_tensor, segments_tensors]) +``` + +Este cambio permite a Neuron SDK rastrear el modelo y optimizarlo para ejecutarse en instancias Inf1. + +Para obtener más información sobre las funciones, las herramientas, los tutoriales de ejemplo y las últimas actualizaciones +de AWS Neuron SDK, consulte la [documentación de AWS NeuronSDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html). diff --git a/docs/source/es/serialization.mdx b/docs/source/es/serialization.mdx deleted file mode 100644 index 4c42fd5d830e..000000000000 --- a/docs/source/es/serialization.mdx +++ /dev/null @@ -1,669 +0,0 @@ - - -# Exportar modelos 🤗 Transformers - -Si necesitas implementar modelos 🤗 Transformers en entornos de producción, te -recomendamos exportarlos a un formato serializado que se pueda cargar y ejecutar -en tiempos de ejecución y hardware especializados. En esta guía, te mostraremos cómo -exportar modelos 🤗 Transformers en dos formatos ampliamente utilizados: ONNX y TorchScript. - -Una vez exportado, un modelo puede optimizarse para la inferencia a través de técnicas -como la cuantización y _pruning_. Si estás interesado en optimizar tus modelos para -que funcionen con la máxima eficiencia, consulta la -[biblioteca de 🤗 Optimum](https://github.com/huggingface/optimum). 
- -## ONNX - -El proyecto [ONNX (Open Neural Network eXchange)](http://onnx.ai) es un -estándar abierto que define un conjunto común de operadores y un formato -de archivo común para representar modelos de aprendizaje profundo en una -amplia variedad de _frameworks_, incluidos PyTorch y TensorFlow. Cuando un modelo -se exporta al formato ONNX, estos operadores se usan para construir un -grafo computacional (a menudo llamado _representación intermedia_) que -representa el flujo de datos a través de la red neuronal. - -Al exponer un grafo con operadores y tipos de datos estandarizados, ONNX facilita -el cambio entre frameworks. Por ejemplo, un modelo entrenado en PyTorch se puede -exportar a formato ONNX y luego importar en TensorFlow (y viceversa). - -🤗 Transformers proporciona un paquete llamado `transformers.onnx`, el cual permite convertir -los checkpoints de un modelo en un grafo ONNX aprovechando los objetos de configuración. -Estos objetos de configuración están hechos a la medida de diferentes arquitecturas de modelos -y están diseñados para ser fácilmente extensibles a otras arquitecturas. - -Las configuraciones a la medida incluyen las siguientes arquitecturas: - - - -- ALBERT -- BART -- BEiT -- BERT -- BigBird -- BigBird-Pegasus -- Blenderbot -- BlenderbotSmall -- BLOOM -- CamemBERT -- CLIP -- CodeGen -- ConvBERT -- ConvNeXT -- Data2VecText -- Data2VecVision -- DeBERTa -- DeBERTa-v2 -- DeiT -- DETR -- DistilBERT -- ELECTRA -- FlauBERT -- GPT Neo -- GPT-J -- I-BERT -- LayoutLM -- LayoutLMv3 -- LeViT -- LongT5 -- M2M100 -- Marian -- mBART -- MobileBERT -- MobileViT -- MT5 -- OpenAI GPT-2 -- Perceiver -- PLBart -- ResNet -- RoBERTa -- RoFormer -- SqueezeBERT -- T5 -- ViT -- XLM -- XLM-RoBERTa -- XLM-RoBERTa-XL -- YOLOS - -En las próximas dos secciones, te mostraremos cómo: - -* Exportar un modelo compatible utilizando el paquete `transformers.onnx`. -* Exportar un modelo personalizado para una arquitectura no compatible. - -### Exportar un model a ONNX - -Para exportar un modelo 🤗 Transformers a ONNX, tienes que instalar primero algunas -dependencias extra: - -```bash -pip install transformers[onnx] -``` - -El paquete `transformers.onnx` puede ser usado luego como un módulo de Python: - -```bash -python -m transformers.onnx --help - -usage: Hugging Face Transformers ONNX exporter [-h] -m MODEL [--feature {causal-lm, ...}] [--opset OPSET] [--atol ATOL] output - -positional arguments: - output Path indicating where to store generated ONNX model. - -optional arguments: - -h, --help show this help message and exit - -m MODEL, --model MODEL - Model ID on huggingface.co or path on disk to load model from. - --feature {causal-lm, ...} - The type of features to export the model with. - --opset OPSET ONNX opset version to export the model with. - --atol ATOL Absolute difference tolerence when validating the model. -``` - -Exportar un checkpoint usando una configuración a la medida se puede hacer de la siguiente manera: - -```bash -python -m transformers.onnx --model=distilbert-base-uncased onnx/ -``` - -que debería mostrar los siguientes registros: - -```bash -Validating ONNX model... - -[✓] ONNX model output names match reference model ({'last_hidden_state'}) - - Validating ONNX Model output "last_hidden_state": - -[✓] (2, 8, 768) matches (2, 8, 768) - -[✓] all values close (atol: 1e-05) -All good, model saved at: onnx/model.onnx -``` - -Esto exporta un grafo ONNX del checkpoint definido por el argumento `--model`. 
-En este ejemplo, es un modelo `distilbert-base-uncased`, pero puede ser cualquier -checkpoint en Hugging Face Hub o que esté almacenado localmente. - -El archivo `model.onnx` resultante se puede ejecutar en uno de los -[muchos aceleradores](https://onnx.ai/supported-tools.html#deployModel) -que admiten el estándar ONNX. Por ejemplo, podemos cargar y ejecutar el -modelo con [ONNX Runtime](https://onnxruntime.ai/) de la siguiente manera: - -```python ->>> from transformers import AutoTokenizer ->>> from onnxruntime import InferenceSession - ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") ->>> session = InferenceSession("onnx/model.onnx") ->>> # ONNX Runtime expects NumPy arrays as input ->>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np") ->>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs)) -``` - -Los nombres necesarios de salida (es decir, `["last_hidden_state"]`) se pueden obtener -echando un vistazo a la configuración ONNX de cada modelo. Por ejemplo, para DistilBERT tenemos: - -```python ->>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig - ->>> config = DistilBertConfig() ->>> onnx_config = DistilBertOnnxConfig(config) ->>> print(list(onnx_config.outputs.keys())) -["last_hidden_state"]s -``` - -El proceso es idéntico para los checkpoints de TensorFlow en Hub. -Por ejemplo, podemos exportar un checkpoint puro de TensorFlow desde -[Keras](https://huggingface.co/keras-io) de la siguiente manera: - -```bash -python -m transformers.onnx --model=keras-io/transformers-qa onnx/ -``` - -Para exportar un modelo que está almacenado localmente, deberás tener los pesos -y tokenizadores del modelo almacenados en un directorio. Por ejemplo, podemos cargar -y guardar un checkpoint de la siguiente manera: - - - -```python ->>> from transformers import AutoTokenizer, AutoModelForSequenceClassification - ->>> # Load tokenizer and PyTorch weights form the Hub ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") ->>> pt_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") ->>> # Save to disk ->>> tokenizer.save_pretrained("local-pt-checkpoint") ->>> pt_model.save_pretrained("local-pt-checkpoint") -``` - -Una vez que se guarda el checkpoint, podemos exportarlo a ONNX usando el argumento `--model` -del paquete `transformers.onnx` al directorio deseado: - -```bash -python -m transformers.onnx --model=local-pt-checkpoint onnx/ -``` - - -```python ->>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification - ->>> # Load tokenizer and TensorFlow weights from the Hub ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") ->>> # Save to disk ->>> tokenizer.save_pretrained("local-tf-checkpoint") ->>> tf_model.save_pretrained("local-tf-checkpoint") -``` - -Una vez que se guarda el checkpoint, podemos exportarlo a ONNX usando el argumento `--model` -del paquete `transformers.onnx` al directorio deseado: - -```bash -python -m transformers.onnx --model=local-tf-checkpoint onnx/ -``` - - - -### Seleccionar características para diferentes topologías de un modelo - -Cada configuración a la medida viene con un conjunto de _características_ que te permiten exportar -modelos para diferentes tipos de topologías o tareas. 
Como se muestra en la siguiente tabla, cada -función está asociada con una auto-clase de automóvil diferente: - -| Feature | Auto Class | -| ------------------------------------ | ------------------------------------ | -| `causal-lm`, `causal-lm-with-past` | `AutoModelForCausalLM` | -| `default`, `default-with-past` | `AutoModel` | -| `masked-lm` | `AutoModelForMaskedLM` | -| `question-answering` | `AutoModelForQuestionAnswering` | -| `seq2seq-lm`, `seq2seq-lm-with-past` | `AutoModelForSeq2SeqLM` | -| `sequence-classification` | `AutoModelForSequenceClassification` | -| `token-classification` | `AutoModelForTokenClassification` | - -Para cada configuración, puedes encontrar la lista de funciones admitidas a través de `FeaturesManager`. -Por ejemplo, para DistilBERT tenemos: - -```python ->>> from transformers.onnx.features import FeaturesManager - ->>> distilbert_features = list(FeaturesManager.get_supported_features_for_model_type("distilbert").keys()) ->>> print(distilbert_features) -["default", "masked-lm", "causal-lm", "sequence-classification", "token-classification", "question-answering"] -``` - -Le puedes pasar una de estas características al argumento `--feature` en el paquete `transformers.onnx`. -Por ejemplo, para exportar un modelo de clasificación de texto, podemos elegir un modelo ya ajustado del Hub y ejecutar: - -```bash -python -m transformers.onnx --model=distilbert-base-uncased-finetuned-sst-2-english \ - --feature=sequence-classification onnx/ -``` - -que mostrará los siguientes registros: - -```bash -Validating ONNX model... - -[✓] ONNX model output names match reference model ({'logits'}) - - Validating ONNX Model output "logits": - -[✓] (2, 2) matches (2, 2) - -[✓] all values close (atol: 1e-05) -All good, model saved at: onnx/model.onnx -``` - -Ten en cuenta que, en este caso, los nombres de salida del modelo ajustado son `logits` en lugar de `last_hidden_state` -que vimos anteriormente con el checkpoint `distilbert-base-uncased`. Esto es de esperarse ya que el modelo ajustado -tiene un cabezal de clasificación secuencial. - - - -Las características que tienen un sufijo 'with-past' (por ejemplo, 'causal-lm-with-past') corresponden a topologías -de modelo con estados ocultos precalculados (clave y valores en los bloques de atención) que se pueden usar para una -decodificación autorregresiva más rápida. - - - - -### Exportar un modelo para una arquitectura no compatible - -Si deseas exportar un modelo cuya arquitectura no es compatible de forma nativa -con la biblioteca, debes seguir tres pasos principales: - -1. Implementa una configuración personalizada en ONNX. -2. Exporta el modelo a ONNX. -3. Valide los resultados de PyTorch y los modelos exportados. - -En esta sección, veremos cómo se implementó la serialización de DistilBERT -para mostrar lo que implica cada paso. - -#### Implementar una configuración personalizada en ONNX - -Comencemos con el objeto de configuración de ONNX. Proporcionamos tres clases abstractas -de las que debe heredar, según el tipo de arquitectura del modelo que quieras exportar: - -* Modelos basados en el _Encoder_ inherente de [`~onnx.config.OnnxConfig`] -* Modelos basados en el _Decoder_ inherente de [`~onnx.config.OnnxConfigWithPast`] -* Modelos _Encoder-decoder_ inherente de [`~onnx.config.OnnxSeq2SeqConfigWithPast`] - - - -Una buena manera de implementar una configuración personalizada en ONNX es observar la implementación -existente en el archivo `configuration_.py` de una arquitectura similar. 
- - - -Dado que DistilBERT es un modelo de tipo _encoder_, su configuración se hereda de `OnnxConfig`: - -```python ->>> from typing import Mapping, OrderedDict ->>> from transformers.onnx import OnnxConfig - - ->>> class DistilBertOnnxConfig(OnnxConfig): -... @property -... def inputs(self) -> Mapping[str, Mapping[int, str]]: -... return OrderedDict( -... [ -... ("input_ids", {0: "batch", 1: "sequence"}), -... ("attention_mask", {0: "batch", 1: "sequence"}), -... ] -... ) -``` - -Cada objeto de configuración debe implementar la propiedad `inputs` y devolver un mapeo, -donde cada llave corresponde a una entrada esperada y cada valor indica el eje de esa entrada. -Para DistilBERT, podemos ver que se requieren dos entradas: `input_ids` y `attention_mask`. -Estas entradas tienen la misma forma de `(batch_size, sequence_length)`, es por lo que vemos -los mismos ejes utilizados en la configuración. - - - -Observa que la propiedad `inputs` para `DistilBertOnnxConfig` devuelve un `OrderedDict`. -Esto nos asegura que las entradas coincidan con su posición relativa dentro del método -`PreTrainedModel.forward()` al rastrear el grafo. Recomendamos usar un `OrderedDict` -para las propiedades `inputs` y `outputs` al implementar configuraciones ONNX personalizadas. - - - -Una vez que hayas implementado una configuración ONNX, puedes crear una -instancia proporcionando la configuración del modelo base de la siguiente manera: - -```python ->>> from transformers import AutoConfig - ->>> config = AutoConfig.from_pretrained("distilbert-base-uncased") ->>> onnx_config = DistilBertOnnxConfig(config) -``` - -El objeto resultante tiene varias propiedades útiles. Por ejemplo, puedes ver el conjunto de operadores ONNX que se -utilizará durante la exportación: - -```python ->>> print(onnx_config.default_onnx_opset) -11 -``` - -También puedes ver los resultados asociados con el modelo de la siguiente manera: - -```python ->>> print(onnx_config.outputs) -OrderedDict([("last_hidden_state", {0: "batch", 1: "sequence"})]) -``` - -Observa que la propiedad de salidas sigue la misma estructura que las entradas; -devuelve un objecto `OrderedDict` de salidas nombradas y sus formas. La estructura -de salida está vinculada a la elección de la función con la que se inicializa la configuración. -Por defecto, la configuración de ONNX se inicializa con la función `default` que -corresponde a exportar un modelo cargado con la clase `AutoModel`. Si quieres exportar -una topología de modelo diferente, simplemente proporciona una característica diferente -al argumento `task` cuando inicialices la configuración de ONNX. Por ejemplo, si quisiéramos -exportar DistilBERT con un cabezal de clasificación de secuencias, podríamos usar: - -```python ->>> from transformers import AutoConfig - ->>> config = AutoConfig.from_pretrained("distilbert-base-uncased") ->>> onnx_config_for_seq_clf = DistilBertOnnxConfig(config, task="sequence-classification") ->>> print(onnx_config_for_seq_clf.outputs) -OrderedDict([('logits', {0: 'batch'})]) -``` - - - -Todas las propiedades base y métodos asociados con [`~onnx.config.OnnxConfig`] y las -otras clases de configuración se pueden sobreescribir si es necesario. -Consulte [`BartOnnxConfig`] para ver un ejemplo avanzado. - - - -#### Exportar el modelo - -Una vez que hayas implementado la configuración de ONNX, el siguiente paso es exportar el modelo. -Aquí podemos usar la función `export()` proporcionada por el paquete `transformers.onnx`. 
-Esta función espera la configuración de ONNX, junto con el modelo base y el tokenizador, -y la ruta para guardar el archivo exportado: - -```python ->>> from pathlib import Path ->>> from transformers.onnx import export ->>> from transformers import AutoTokenizer, AutoModel - ->>> onnx_path = Path("model.onnx") ->>> model_ckpt = "distilbert-base-uncased" ->>> base_model = AutoModel.from_pretrained(model_ckpt) ->>> tokenizer = AutoTokenizer.from_pretrained(model_ckpt) - ->>> onnx_inputs, onnx_outputs = export(tokenizer, base_model, onnx_config, onnx_config.default_onnx_opset, onnx_path) -``` - -Los objetos `onnx_inputs` y `onnx_outputs` devueltos por la función `export()` -son listas de llaves definidas en las propiedades `inputs` y `outputs` de la configuración. -Una vez exportado el modelo, puedes probar que el modelo está bien formado de la siguiente manera: - -```python ->>> import onnx - ->>> onnx_model = onnx.load("model.onnx") ->>> onnx.checker.check_model(onnx_model) -``` - - - -Si tu modelo tiene más de 2GB, verás que se crean muchos archivos adicionales durante la exportación. -Esto es _esperado_ porque ONNX usa [Búferes de protocolo](https://developers.google.com/protocol-buffers/) -para almacenar el modelo y éstos tienen un límite de tamaño de 2 GB. Consulta la -[documentación de ONNX](https://github.com/onnx/onnx/blob/master/docs/ExternalData.md) para obtener -instrucciones sobre cómo cargar modelos con datos externos. - - - -#### Validar los resultados del modelo - -El paso final es validar que los resultados del modelo base y exportado coincidan dentro -de cierta tolerancia absoluta. Aquí podemos usar la función `validate_model_outputs()` -proporcionada por el paquete `transformers.onnx` de la siguiente manera: - -```python ->>> from transformers.onnx import validate_model_outputs - ->>> validate_model_outputs( -... onnx_config, tokenizer, base_model, onnx_path, onnx_outputs, onnx_config.atol_for_validation -... ) -``` - -Esta función usa el método `OnnxConfig.generate_dummy_inputs()` para generar entradas para el modelo base -y exportado, y la tolerancia absoluta se puede definir en la configuración. En general, encontramos una -concordancia numérica en el rango de 1e-6 a 1e-4, aunque es probable que cualquier valor menor que 1e-3 esté bien. - -### Contribuir con una nueva configuración a 🤗 Transformers - -¡Estamos buscando expandir el conjunto de configuraciones a la medida para usar y agradecemos las contribuciones de la comunidad! -Si deseas contribuir con su colaboración a la biblioteca, deberás: - -* Implementa la configuración de ONNX en el archivo `configuration_.py` correspondiente -* Incluye la arquitectura del modelo y las características correspondientes en [`~onnx.features.FeatureManager`] -* Agrega tu arquitectura de modelo a las pruebas en `test_onnx_v2.py` - -Revisa cómo fue la contribución para la [configuración de IBERT](https://github.com/huggingface/transformers/pull/14868/files) -y así tener una idea de lo que necesito. - -## TorchScript - - - -Este es el comienzo de nuestros experimentos con TorchScript y todavía estamos explorando sus capacidades con modelos de -tamaño de entrada variable. Es un tema de interés y profundizaremos nuestro análisis en las próximas -versiones, con más ejemplos de código, una implementación más flexible y puntos de referencia que comparen códigos -basados en Python con TorchScript compilado. 
- - - -Según la documentación de PyTorch: "TorchScript es una forma de crear modelos serializables y optimizables a partir del -código de PyTorch". Los dos módulos de Pytorch [JIT y TRACE](https://pytorch.org/docs/stable/jit.html) permiten al -desarrollador exportar su modelo para reutilizarlo en otros programas, como los programas C++ orientados a la eficiencia. - -Hemos proporcionado una interfaz que permite exportar modelos de 🤗 Transformers a TorchScript para que puedan reutilizarse -en un entorno diferente al de un programa Python basado en PyTorch. Aquí explicamos cómo exportar y usar nuestros modelos -usando TorchScript. - -Exportar un modelo requiere de dos cosas: - -- un pase hacia adelante con entradas ficticias. -- instanciación del modelo con la indicador `torchscript`. - -Estas necesidades implican varias cosas con las que los desarrolladores deben tener cuidado. Éstas se detallan a continuación. - -### Indicador de TorchScript y pesos atados - -Este indicador es necesario porque la mayoría de los modelos de lenguaje en este repositorio tienen pesos vinculados entre su capa -de `Embedding` y su capa de `Decoding`. TorchScript no permite la exportación de modelos que tengan pesos atados, por lo que es -necesario desvincular y clonar los pesos previamente. - -Esto implica que los modelos instanciados con el indicador `torchscript` tienen su capa `Embedding` y `Decoding` separadas, -lo que significa que no deben entrenarse más adelante. El entrenamiento desincronizaría las dos capas, lo que generaría -resultados inesperados. - -Este no es el caso de los modelos que no tienen un cabezal de modelo de lenguaje, ya que no tienen pesos atados. -Estos modelos se pueden exportar de forma segura sin el indicador `torchscript`. - -### Entradas ficticias y longitudes estándar - -Las entradas ficticias se utilizan para crear un modelo de pase hacia adelante. Mientras los valores de las entradas se -propagan a través de las capas, PyTorch realiza un seguimiento de las diferentes operaciones ejecutadas en cada tensor. -Estas operaciones registradas se utilizan luego para crear el "rastro" del modelo. - -El rastro se crea en relación con las dimensiones de las entradas. Por lo tanto, está limitado por las dimensiones de la -entrada ficticia y no funcionará para ninguna otra longitud de secuencia o tamaño de lote. Al intentar con un tamaño diferente, -un error como: - -`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2` - -aparecerá. Por lo tanto, se recomienda rastrear el modelo con un tamaño de entrada ficticia al menos tan grande como la -entrada más grande que se alimentará al modelo durante la inferencia. El _padding_ se puede realizar para completar los -valores que faltan. Sin embargo, como el modelo se habrá rastreado con un tamaño de entrada grande, las dimensiones de -las diferentes matrices también serán grandes, lo que dará como resultado más cálculos. - -Se recomienda tener cuidado con el número total de operaciones realizadas en cada entrada y seguir de cerca el rendimiento -al exportar modelos de longitud de secuencia variable. - -### Usar TorchScript en Python - -A continuación se muestra un ejemplo que muestra cómo guardar, cargar modelos y cómo usar el rastreo para la inferencia. - -#### Guardando un modelo - -Este fragmento muestra cómo usar TorchScript para exportar un `BertModel`. 
Aquí, el `BertModel` se instancia de acuerdo -con la clase `BertConfig` y luego se guarda en el disco con el nombre de archivo `traced_bert.pt` - -```python -from transformers import BertModel, BertTokenizer, BertConfig -import torch - -enc = BertTokenizer.from_pretrained("bert-base-uncased") - -# Tokenizing input text -text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" -tokenized_text = enc.tokenize(text) - -# Masking one of the input tokens -masked_index = 8 -tokenized_text[masked_index] = "[MASK]" -indexed_tokens = enc.convert_tokens_to_ids(tokenized_text) -segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1] - -# Creating a dummy input -tokens_tensor = torch.tensor([indexed_tokens]) -segments_tensors = torch.tensor([segments_ids]) -dummy_input = [tokens_tensor, segments_tensors] - -# Initializing the model with the torchscript flag -# Flag set to True even though it is not necessary as this model does not have an LM Head. -config = BertConfig( - vocab_size_or_config_json_file=32000, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - torchscript=True, -) - -# Instantiating the model -model = BertModel(config) - -# The model needs to be in evaluation mode -model.eval() - -# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag -model = BertModel.from_pretrained("bert-base-uncased", torchscript=True) - -# Creating the trace -traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors]) -torch.jit.save(traced_model, "traced_bert.pt") -``` - -#### Cargar un modelo - -Este fragmento muestra cómo cargar el `BertModel` que se guardó previamente en el disco con el nombre `traced_bert.pt`. -Estamos reutilizando el `dummy_input` previamente inicializado. - -```python -loaded_model = torch.jit.load("traced_bert.pt") -loaded_model.eval() - -all_encoder_layers, pooled_output = loaded_model(*dummy_input) -``` - -#### Usar un modelo rastreado para la inferencia - -Usar el modelo rastreado para la inferencia es tan simple como usar su método `__call__`: - -```python -traced_model(tokens_tensor, segments_tensors) -``` - -### Implementar los modelos HuggingFace TorchScript en AWS mediante Neuron SDK - -AWS presentó la familia de instancias [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) para la inferencia -de aprendizaje automático de bajo costo y alto rendimiento en la nube. Las instancias Inf1 funcionan con el chip AWS -Inferentia, un acelerador de hardware personalizado, que se especializa en cargas de trabajo de inferencia de aprendizaje -profundo. [AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) es el kit de desarrollo para Inferentia -que admite el rastreo y la optimización de modelos de transformers para su implementación en Inf1. El SDK de Neuron proporciona: - - -1. API fácil de usar con una línea de cambio de código para rastrear y optimizar un modelo de TorchScript para la inferencia en la nube. -2. Optimizaciones de rendimiento listas para usar con un [costo-rendimiento mejorado](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/>) -3. Soporte para modelos HuggingFace Transformers construidos con [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) -o [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html). 
- -#### Implicaciones - -Los modelos Transformers basados en la arquitectura -[BERT (Representaciones de _Enconder_ bidireccional de Transformers)](https://huggingface.co/docs/transformers/main/model_doc/bert), -o sus variantes, como [distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) y -[roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta), se ejecutarán mejor en Inf1 para tareas no -generativas, como la respuesta extractiva de preguntas, la clasificación de secuencias y la clasificación de tokens. -Como alternativa, las tareas de generación de texto se pueden adaptar para ejecutarse en Inf1, según este -[tutorial de AWS Neuron MarianMT](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html). -Puedes encontrar más información sobre los modelos que están listos para usarse en Inferentia en la -[sección _Model Architecture Fit_ de la documentación de Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia). - -#### Dependencias - -Usar AWS Neuron para convertir modelos requiere las siguientes dependencias y entornos: - -* Un [entorno Neuron SDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide), -que viene preconfigurado en [AWS Deep Learning AMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html). - -#### Convertir un modelo a AWS Neuron - -Con el mismo script usado en [Uso de TorchScript en Python](https://huggingface.co/docs/transformers/main/es/serialization#using-torchscript-in-python) -para rastrear un "BertModel", puedes importar la extensión del _framework_ `torch.neuron` para acceder a los componentes -del SDK de Neuron a través de una API de Python. - -```python -from transformers import BertModel, BertTokenizer, BertConfig -import torch -import torch.neuron -``` -Y modificando la línea de código de rastreo de: - -```python -torch.jit.trace(model, [tokens_tensor, segments_tensors]) -``` - -con lo siguiente: - -```python -torch.neuron.trace(model, [token_tensor, segments_tensors]) -``` - -Este cambio permite a Neuron SDK rastrear el modelo y optimizarlo para ejecutarse en instancias Inf1. - -Para obtener más información sobre las funciones, las herramientas, los tutoriales de ejemplo y las últimas actualizaciones -de AWS Neuron SDK, consulte la [documentación de AWS NeuronSDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html). diff --git a/docs/source/es/tasks/asr.md b/docs/source/es/tasks/asr.md new file mode 100644 index 000000000000..850bdfd711e7 --- /dev/null +++ b/docs/source/es/tasks/asr.md @@ -0,0 +1,366 @@ + + +# Reconocimiento automático del habla + + + +El reconocimiento automático del habla (ASR, por sus siglas en inglés) convierte una señal de habla en texto y mapea una secuencia de entradas de audio en salidas en forma de texto. Los asistentes virtuales como Siri y Alexa usan modelos de ASR para ayudar a sus usuarios todos los días. De igual forma, hay muchas otras aplicaciones, como la transcripción de contenidos en vivo y la toma automática de notas durante reuniones. + +En esta guía te mostraremos como: + +1. Hacer fine-tuning al modelo [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) con el dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) para transcribir audio a texto. +2. Usar tu modelo ajustado para tareas de inferencia. 
+ + + +Revisa la [página de la tarea](https://huggingface.co/tasks/automatic-speech-recognition) de reconocimiento automático del habla para acceder a más información sobre los modelos, datasets y métricas asociados. + + + +Antes de comenzar, asegúrate de haber instalado todas las librerías necesarias: + +```bash +pip install transformers datasets evaluate jiwer +``` + +Te aconsejamos iniciar sesión con tu cuenta de Hugging Face para que puedas subir tu modelo y comartirlo con la comunidad. Cuando te sea solicitado, ingresa tu token para iniciar sesión: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Cargar el dataset MInDS-14 + +Comencemos cargando un subconjunto más pequeño del dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) desde la biblioteca 🤗 Datasets. De esta forma, tendrás la oportunidad de experimentar y asegurarte de que todo funcione antes de invertir más tiempo entrenando con el dataset entero. + +```py +>>> from datasets import load_dataset, Audio + +>>> minds = load_dataset("PolyAI/minds14", name="en-US", split="train[:100]") +``` +Divide la partición `train` (entrenamiento) en una partición de entrenamiento y una de prueba usando el método [`~Dataset.train_test_split`]: + +```py +>>> minds = minds.train_test_split(test_size=0.2) +``` + +Ahora échale un vistazo al dataset: + +```py +>>> minds +DatasetDict({ + train: Dataset({ + features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], + num_rows: 16 + }) + test: Dataset({ + features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], + num_rows: 4 + }) +}) +``` + +Aunque el dataset contiene mucha información útil, como los campos `lang_id` (identificador del lenguaje) y `english_transcription` (transcripción al inglés), en esta guía nos enfocaremos en los campos `audio` y `transcription`. Puedes quitar las otras columnas con el método [`~datasets.Dataset.remove_columns`]: + +```py +>>> minds = minds.remove_columns(["english_transcription", "intent_class", "lang_id"]) +``` + +Vuelve a echarle un vistazo al ejemplo: + +```py +>>> minds["train"][0] +{'audio': {'array': array([-0.00024414, 0. , 0. , ..., 0.00024414, + 0.00024414, 0.00024414], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', + 'sampling_rate': 8000}, + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', + 'transcription': "hi I'm trying to use the banking app on my phone and currently my checking and savings account balance is not refreshing"} +``` + +Hay dos campos: + +- `audio`: un `array` (arreglo) unidimensional de la señal de habla que debe ser invocado para cargar y re-muestrear el archivo de audio. +- `transcription`: el texto objetivo. 
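+
+Si quieres hacer una comprobación rápida, un sketch mínimo (asumiendo que `minds` ya está cargado como arriba) para estimar la duración de un clip consiste en dividir el número de muestras del arreglo entre su tasa de muestreo:
+
+```py
+>>> sample = minds["train"][0]["audio"]
+>>> # duración aproximada en segundos = número de muestras / tasa de muestreo
+>>> duracion_en_segundos = len(sample["array"]) / sample["sampling_rate"]
+```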
+
+## Preprocesamiento
+
+El siguiente paso es cargar un procesador Wav2Vec2 para procesar la señal de audio:
+
+```py
+>>> from transformers import AutoProcessor
+
+>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base")
+```
+El dataset MInDS-14 tiene una tasa de muestreo de 8000 Hz (puedes encontrar esta información en su [tarjeta de dataset](https://huggingface.co/datasets/PolyAI/minds14)), lo que significa que tendrás que re-muestrear el dataset a 16000 Hz para poder usar el modelo Wav2Vec2 pre-entrenado:
+
+```py
+>>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
+>>> minds["train"][0]
+{'audio': {'array': array([-2.38064706e-04, -1.58618059e-04, -5.43987835e-06, ...,
+         2.78103951e-04,  2.38446111e-04,  1.18740834e-04], dtype=float32),
+  'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav',
+  'sampling_rate': 16000},
+ 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav',
+ 'transcription': "hi I'm trying to use the banking app on my phone and currently my checking and savings account balance is not refreshing"}
+```
+
+Como puedes ver en el campo `transcription`, el texto contiene una mezcla de caracteres en mayúsculas y en minúsculas. El tokenizer Wav2Vec2 fue entrenado únicamente con caracteres en mayúsculas, así que tendrás que asegurarte de que el texto se ajuste al vocabulario del tokenizer:
+
+```py
+>>> def uppercase(example):
+...     return {"transcription": example["transcription"].upper()}
+
+
+>>> minds = minds.map(uppercase)
+```
+
+Ahora vamos a crear una función de preprocesamiento que:
+
+1. Invoque la columna `audio` para cargar y re-muestrear el archivo de audio.
+2. Extraiga el campo `input_values` (valores de entrada) del archivo de audio y haga la tokenización de la columna `transcription` con el procesador.
+
+```py
+>>> def prepare_dataset(batch):
+...     audio = batch["audio"]
+...     batch = processor(audio["array"], sampling_rate=audio["sampling_rate"], text=batch["transcription"])
+...     batch["input_length"] = len(batch["input_values"][0])
+...     return batch
+```
+
+Para aplicar la función de preprocesamiento a todo el dataset, puedes usar la función [`~datasets.Dataset.map`] de 🤗 Datasets. Para acelerar la función `map` puedes incrementar el número de procesos con el parámetro `num_proc`. Quita las columnas que no necesites con el método [`~datasets.Dataset.remove_columns`]:
+
+```py
+>>> encoded_minds = minds.map(prepare_dataset, remove_columns=minds.column_names["train"], num_proc=4)
+```
+
+🤗 Transformers no tiene un collator de datos para la tarea de ASR, así que tendrás que adaptar el [`DataCollatorWithPadding`] para crear un lote de ejemplos. El collator también le aplicará padding dinámico a tu texto y etiquetas para que tengan la longitud del elemento más largo en su lote (en vez de la mayor longitud en el dataset entero), de forma que todas las muestras tengan una longitud uniforme. Aunque es posible hacerle padding a tu texto con el `tokenizer` haciendo `padding=True`, el padding dinámico es más eficiente.
+ +A diferencia de otros collators de datos, este tiene que aplicarle un método de padding distinto a los campos `input_values` (valores de entrada) y `labels` (etiquetas): + +```py +>>> import torch + +>>> from dataclasses import dataclass, field +>>> from typing import Any, Dict, List, Optional, Union + + +>>> @dataclass +... class DataCollatorCTCWithPadding: +... processor: AutoProcessor +... padding: Union[bool, str] = "longest" + +... def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: +... # particiona las entradas y las etiquetas ya que tienen que tener longitudes distintas y +... # requieren métodos de padding diferentes +... input_features = [{"input_values": feature["input_values"][0]} for feature in features] +... label_features = [{"input_ids": feature["labels"]} for feature in features] + +... batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt") + +... labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt") + +... # remplaza el padding con -100 para ignorar la pérdida de forma correcta +... labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) + +... batch["labels"] = labels + +... return batch +``` + +Ahora puedes instanciar tu `DataCollatorForCTCWithPadding`: + +```py +>>> data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest") +``` + +## Evaluación + +A menudo es útil incluir una métrica durante el entrenamiento para evaluar el rendimiento de tu modelo. Puedes cargar un método de evaluación rápidamente con la biblioteca 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index). Para esta tarea, puedes usar la métrica de [tasa de error por palabra](https://huggingface.co/spaces/evaluate-metric/wer) (WER, por sus siglas en inglés). Puedes ver la [guía rápida](https://huggingface.co/docs/evaluate/a_quick_tour) de 🤗 Evaluate para aprender más acerca de cómo cargar y computar una métrica. + +```py +>>> import evaluate + +>>> wer = evaluate.load("wer") +``` + +Ahora crea una función que le pase tus predicciones y etiquetas a [`~evaluate.EvaluationModule.compute`] para calcular la WER: + +```py +>>> import numpy as np + + +>>> def compute_metrics(pred): +... pred_logits = pred.predictions +... pred_ids = np.argmax(pred_logits, axis=-1) + +... pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id + +... pred_str = processor.batch_decode(pred_ids) +... label_str = processor.batch_decode(pred.label_ids, group_tokens=False) + +... wer = wer.compute(predictions=pred_str, references=label_str) + +... return {"wer": wer} +``` + +Ahora tu función `compute_metrics` (computar métricas) está lista y podrás usarla cuando estés preparando tu entrenamiento. + +## Entrenamiento + + + + + +Si no tienes experiencia haciéndole fine-tuning a un modelo con el [`Trainer`], ¡échale un vistazo al tutorial básico [aquí](../training#train-with-pytorch-trainer)! + + + +¡Ya puedes empezar a entrenar tu modelo! Para ello, carga Wav2Vec2 con [`AutoModelForCTC`]. Especifica la reducción que quieres aplicar con el parámetro `ctc_loss_reduction`. A menudo, es mejor usar el promedio en lugar de la sumatoria que se hace por defecto. + +```py +>>> from transformers import AutoModelForCTC, TrainingArguments, Trainer + +>>> model = AutoModelForCTC.from_pretrained( +... "facebook/wav2vec2-base", +... ctc_loss_reduction="mean", +... pad_token_id=processor.tokenizer.pad_token_id, +... 
) +``` +En este punto, solo quedan tres pasos: + +1. Define tus hiperparámetros de entrenamiento en [`TrainingArguments`]. El único parámetro obligatorio es `output_dir` (carpeta de salida), el cual especifica dónde guardar tu modelo. Puedes subir este modelo al Hub haciendo `push_to_hub=True` (debes haber iniciado sesión en Hugging Face para subir tu modelo). Al final de cada época, el [`Trainer`] evaluará la WER y guardará el punto de control del entrenamiento. +2. Pásale los argumentos del entrenamiento al [`Trainer`] junto con el modelo, el dataset, el tokenizer, el collator de datos y la función `compute_metrics`. +3. Llama el método [`~Trainer.train`] para hacerle fine-tuning a tu modelo. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_asr_mind_model", +... per_device_train_batch_size=8, +... gradient_accumulation_steps=2, +... learning_rate=1e-5, +... warmup_steps=500, +... max_steps=2000, +... gradient_checkpointing=True, +... fp16=True, +... group_by_length=True, +... evaluation_strategy="steps", +... per_device_eval_batch_size=8, +... save_steps=1000, +... eval_steps=1000, +... logging_steps=25, +... load_best_model_at_end=True, +... metric_for_best_model="wer", +... greater_is_better=False, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=encoded_minds["train"], +... eval_dataset=encoded_minds["test"], +... tokenizer=processor.feature_extractor, +... data_collator=data_collator, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +Una vez que el entrenamiento haya sido completado, comparte tu modelo en el Hub con el método [`~transformers.Trainer.push_to_hub`] para que todo el mundo pueda usar tu modelo: + +```py +>>> trainer.push_to_hub() +``` + + + + + +Para ver un ejemplo más detallado de cómo hacerle fine-tuning a un modelo para reconocimiento automático del habla, échale un vistazo a esta [entrada de blog](https://huggingface.co/blog/fine-tune-wav2vec2-english) para ASR en inglés y a esta [entrada](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) para ASR multilingüe. + + + +## Inferencia + +¡Genial, ahora que le has hecho fine-tuning a un modelo, puedes usarlo para inferencia! + +Carga el archivo de audio sobre el cual quieras correr la inferencia. ¡Recuerda re-muestrar la tasa de muestreo del archivo de audio para que sea la misma del modelo si es necesario! + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("PolyAI/minds14", "en-US", split="train") +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) +>>> sampling_rate = dataset.features["audio"].sampling_rate +>>> audio_file = dataset[0]["audio"]["path"] +``` + +La manera más simple de probar tu modelo para hacer inferencia es usarlo en un [`pipeline`]. Puedes instanciar un `pipeline` para reconocimiento automático del habla con tu modelo y pasarle tu archivo de audio: + +```py +>>> from transformers import pipeline + +>>> transcriber = pipeline("automatic-speech-recognition", model="stevhliu/my_awesome_asr_minds_model") +>>> transcriber(audio_file) +{'text': 'I WOUD LIKE O SET UP JOINT ACOUNT WTH Y PARTNER'} +``` + + + +La transcripción es decente, pero podría ser mejor. ¡Intenta hacerle fine-tuning a tu modelo con más ejemplos para obtener resultados aún mejores! 
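+
+Como alternativa, este sketch mínimo le pasa al `pipeline` el audio ya cargado en memoria en lugar de la ruta del archivo (el diccionario con las claves `raw` y `sampling_rate` es uno de los formatos de entrada que acepta el pipeline de ASR; revisa la documentación de tu versión de 🤗 Transformers):
+
+```py
+>>> # el arreglo ya fue re-muestreado a 16 kHz con cast_column más arriba
+>>> resultado = transcriber({"raw": dataset[0]["audio"]["array"], "sampling_rate": sampling_rate})
+```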
+ + + +También puedes replicar de forma manual los resultados del `pipeline` si lo deseas: + + + +Carga un procesador para preprocesar el archivo de audio y la transcripción y devuelve el `input` como un tensor de PyTorch: + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained("stevhliu/my_awesome_asr_mind_model") +>>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") +``` + +Pásale tus entradas al modelo y devuelve los logits: + +```py +>>> from transformers import AutoModelForCTC + +>>> model = AutoModelForCTC.from_pretrained("stevhliu/my_awesome_asr_mind_model") +>>> with torch.no_grad(): +... logits = model(**inputs).logits +``` + +Obtén los identificadores de los tokens con mayor probabilidad en las predicciones y usa el procesador para decodificarlos y transformarlos en texto: + +```py +>>> import torch + +>>> predicted_ids = torch.argmax(logits, dim=-1) +>>> transcription = processor.batch_decode(predicted_ids) +>>> transcription +['I WOUL LIKE O SET UP JOINT ACOUNT WTH Y PARTNER'] +``` + + diff --git a/docs/source/es/tasks/asr.mdx b/docs/source/es/tasks/asr.mdx deleted file mode 100644 index 7d331b11f7ea..000000000000 --- a/docs/source/es/tasks/asr.mdx +++ /dev/null @@ -1,363 +0,0 @@ - - -# Reconocimiento automático del habla - - - -El reconocimiento automático del habla (ASR, por sus siglas en inglés) convierte una señal de habla en texto y mapea una secuencia de entradas de audio en salidas en forma de texto. Los asistentes virtuales como Siri y Alexa usan modelos de ASR para ayudar a sus usuarios todos los días. De igual forma, hay muchas otras aplicaciones, como la transcripción de contenidos en vivo y la toma automática de notas durante reuniones. - -En esta guía te mostraremos como: - -1. Hacer fine-tuning al modelo [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) con el dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) para transcribir audio a texto. -2. Usar tu modelo ajustado para tareas de inferencia. - - - -Revisa la [página de la tarea](https://huggingface.co/tasks/automatic-speech-recognition) de reconocimiento automático del habla para acceder a más información sobre los modelos, datasets y métricas asociados. - - - -Antes de comenzar, asegúrate de haber instalado todas las librerías necesarias: - -```bash -pip install transformers datasets evaluate jiwer -``` - -Te aconsejamos iniciar sesión con tu cuenta de Hugging Face para que puedas subir tu modelo y comartirlo con la comunidad. Cuando te sea solicitado, ingresa tu token para iniciar sesión: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Cargar el dataset MInDS-14 - -Comencemos cargando un subconjunto más pequeño del dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) desde la biblioteca 🤗 Datasets. De esta forma, tendrás la oportunidad de experimentar y asegurarte de que todo funcione antes de invertir más tiempo entrenando con el dataset entero. 
- -```py ->>> from datasets import load_dataset, Audio - ->>> minds = load_dataset("PolyAI/minds14", name="en-US", split="train[:100]") -``` -Divide la partición `train` (entrenamiento) en una partición de entrenamiento y una de prueba usando el método [`~Dataset.train_test_split`]: - -```py ->>> minds = minds.train_test_split(test_size=0.2) -``` - -Ahora échale un vistazo al dataset: - -```py ->>> minds -DatasetDict({ - train: Dataset({ - features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], - num_rows: 16 - }) - test: Dataset({ - features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], - num_rows: 4 - }) -}) -``` - -Aunque el dataset contiene mucha información útil, como los campos `lang_id` (identificador del lenguaje) y `english_transcription` (transcripción al inglés), en esta guía nos enfocaremos en los campos `audio` y `transcription`. Puedes quitar las otras columnas con el método [`~datasets.Dataset.remove_columns`]: - -```py ->>> minds = minds.remove_columns(["english_transcription", "intent_class", "lang_id"]) -``` - -Vuelve a echarle un vistazo al ejemplo: - -```py ->>> minds["train"][0] -{'audio': {'array': array([-0.00024414, 0. , 0. , ..., 0.00024414, - 0.00024414, 0.00024414], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', - 'sampling_rate': 8000}, - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', - 'transcription': "hi I'm trying to use the banking app on my phone and currently my checking and savings account balance is not refreshing"} -``` - -Hay dos campos: - -- `audio`: un `array` (arreglo) unidimensional de la señal de habla que debe ser invocado para cargar y re-muestrear el archivo de audio. -- `transcription`: el texto objetivo. - -## Preprocesamiento - -El siguiente paso es cargar un procesador Wav2Vec2 para procesar la señal de audio: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base") -``` -El dataset MInDS-14 tiene una tasa de muestreo de 8000kHz (puedes encontrar esta información en su [tarjeta de dataset](https://huggingface.co/datasets/PolyAI/minds14)), lo que significa que tendrás que re-muestrear el dataset a 16000kHz para poder usar el modelo Wav2Vec2 pre-entrenado: - -```py ->>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000)) ->>> minds["train"][0] -{'audio': {'array': array([-2.38064706e-04, -1.58618059e-04, -5.43987835e-06, ..., - 2.78103951e-04, 2.38446111e-04, 1.18740834e-04], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', - 'sampling_rate': 16000}, - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', - 'transcription': "hi I'm trying to use the banking app on my phone and currently my checking and savings account balance is not refreshing"} -``` - -Como puedes ver en el campo `transcription`, el texto contiene una mezcla de carácteres en mayúsculas y en minúsculas. 
El tokenizer Wav2Vec2 fue entrenado únicamente con carácteres en mayúsculas, así que tendrás que asegurarte de que el texto se ajuste al vocabulario del tokenizer: - -```py ->>> def uppercase(example): -... return {"transcription": example["transcription"].upper()} - - ->>> minds = minds.map(uppercase) -``` - -Ahora vamos a crear una función de preprocesamiento que: - -1. Invoque la columna `audio` para cargar y re-muestrear el archivo de audio. -2. Extraiga el campo `input_values` (valores de entrada) del archivo de audio y haga la tokenización de la columna `transcription` con el procesador. - -```py ->>> def prepare_dataset(batch): -... audio = batch["audio"] -... batch = processor(audio["array"], sampling_rate=audio["sampling_rate"], text=batch["transcription"]) -... batch["input_length"] = len(batch["input_values"][0]) -... return batch -``` - -Para aplicar la función de preprocesamiento a todo el dataset, puedes usar la función [`~datasets.Dataset.map`] de 🤗 Datasets. Para acelerar la función `map` puedes incrementar el número de procesos con el parámetro `num_proc`. Quita las columnas que no necesites con el método [`~datasets.Dataset.remove_columns`]: - -```py ->>> encoded_minds = minds.map(prepare_dataset, remove_columns=minds.column_names["train"], num_proc=4) -``` - -🤗 Transformers no tiene un collator de datos para la tarea de ASR, así que tendrás que adaptar el [`DataCollatorWithPadding`] para crear un lote de ejemplos. El collator también le aplicará padding dinámico a tu texto y etiquetas para que tengan la longitud del elemento más largo en su lote (en vez de la mayor longitud en el dataset entero), de forma que todas las muestras tengan una longitud uniforme. Aunque es posible hacerle padding a tu texto con el `tokenizer` haciendo `padding=True`, el padding dinámico es más eficiente. - -A diferencia de otros collators de datos, este tiene que aplicarle un método de padding distinto a los campos `input_values` (valores de entrada) y `labels` (etiquetas): - -```py ->>> import torch - ->>> from dataclasses import dataclass, field ->>> from typing import Any, Dict, List, Optional, Union - - ->>> @dataclass -... class DataCollatorCTCWithPadding: - -... processor: AutoProcessor -... padding: Union[bool, str] = "longest" - -... def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: -... # particiona las entradas y las etiquetas ya que tienen que tener longitudes distintas y -... # requieren métodos de padding diferentes -... input_features = [{"input_values": feature["input_values"][0]} for feature in features] -... label_features = [{"input_ids": feature["labels"]} for feature in features] - -... batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt") - -... labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt") - -... # remplaza el padding con -100 para ignorar la pérdida de forma correcta -... labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) - -... batch["labels"] = labels - -... return batch -``` - -Ahora puedes instanciar tu `DataCollatorForCTCWithPadding`: - -```py ->>> data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest") -``` - -## Evaluación - -A menudo es útil incluir una métrica durante el entrenamiento para evaluar el rendimiento de tu modelo. Puedes cargar un método de evaluación rápidamente con la biblioteca 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index). 
Para esta tarea, puedes usar la métrica de [tasa de error por palabra](https://huggingface.co/spaces/evaluate-metric/wer) (WER, por sus siglas en inglés). Puedes ver la [guía rápida](https://huggingface.co/docs/evaluate/a_quick_tour) de 🤗 Evaluate para aprender más acerca de cómo cargar y computar una métrica. - -```py ->>> import evaluate - ->>> wer = evaluate.load("wer") -``` - -Ahora crea una función que le pase tus predicciones y etiquetas a [`~evaluate.EvaluationModule.compute`] para calcular la WER: - -```py ->>> import numpy as np - - ->>> def compute_metrics(pred): -... pred_logits = pred.predictions -... pred_ids = np.argmax(pred_logits, axis=-1) - -... pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id - -... pred_str = processor.batch_decode(pred_ids) -... label_str = processor.batch_decode(pred.label_ids, group_tokens=False) - -... wer = wer.compute(predictions=pred_str, references=label_str) - -... return {"wer": wer} -``` - -Ahora tu función `compute_metrics` (computar métricas) está lista y podrás usarla cuando estés preparando tu entrenamiento. - -## Entrenamiento - - - - - -Si no tienes experiencia haciéndole fine-tuning a un modelo con el [`Trainer`], ¡échale un vistazo al tutorial básico [aquí](../training#train-with-pytorch-trainer)! - - - -¡Ya puedes empezar a entrenar tu modelo! Para ello, carga Wav2Vec2 con [`AutoModelForCTC`]. Especifica la reducción que quieres aplicar con el parámetro `ctc_loss_reduction`. A menudo, es mejor usar el promedio en lugar de la sumatoria que se hace por defecto. - -```py ->>> from transformers import AutoModelForCTC, TrainingArguments, Trainer - ->>> model = AutoModelForCTC.from_pretrained( -... "facebook/wav2vec2-base", -... ctc_loss_reduction="mean", -... pad_token_id=processor.tokenizer.pad_token_id, -... ) -``` -En este punto, solo quedan tres pasos: - -1. Define tus hiperparámetros de entrenamiento en [`TrainingArguments`]. El único parámetro obligatorio es `output_dir` (carpeta de salida), el cual especifica dónde guardar tu modelo. Puedes subir este modelo al Hub haciendo `push_to_hub=True` (debes haber iniciado sesión en Hugging Face para subir tu modelo). Al final de cada época, el [`Trainer`] evaluará la WER y guardará el punto de control del entrenamiento. -2. Pásale los argumentos del entrenamiento al [`Trainer`] junto con el modelo, el dataset, el tokenizer, el collator de datos y la función `compute_metrics`. -3. Llama el método [`~Trainer.train`] para hacerle fine-tuning a tu modelo. - -```py ->>> training_args = TrainingArguments( -... output_dir="my_awesome_asr_mind_model", -... per_device_train_batch_size=8, -... gradient_accumulation_steps=2, -... learning_rate=1e-5, -... warmup_steps=500, -... max_steps=2000, -... gradient_checkpointing=True, -... fp16=True, -... group_by_length=True, -... evaluation_strategy="steps", -... per_device_eval_batch_size=8, -... save_steps=1000, -... eval_steps=1000, -... logging_steps=25, -... load_best_model_at_end=True, -... metric_for_best_model="wer", -... greater_is_better=False, -... push_to_hub=True, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=encoded_minds["train"], -... eval_dataset=encoded_minds["test"], -... tokenizer=processor.feature_extractor, -... data_collator=data_collator, -... compute_metrics=compute_metrics, -... 
) - ->>> trainer.train() -``` - -Una vez que el entrenamiento haya sido completado, comparte tu modelo en el Hub con el método [`~transformers.Trainer.push_to_hub`] para que todo el mundo pueda usar tu modelo: - -```py ->>> trainer.push_to_hub() -``` - - - - - -Para ver un ejemplo más detallado de cómo hacerle fine-tuning a un modelo para reconocimiento automático del habla, échale un vistazo a esta [entrada de blog](https://huggingface.co/blog/fine-tune-wav2vec2-english) para ASR en inglés y a esta [entrada](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) para ASR multilingüe. - - - -## Inferencia - -¡Genial, ahora que le has hecho fine-tuning a un modelo, puedes usarlo para inferencia! - -Carga el archivo de audio sobre el cual quieras correr la inferencia. ¡Recuerda re-muestrar la tasa de muestreo del archivo de audio para que sea la misma del modelo si es necesario! - -```py ->>> from datasets import load_dataset, Audio - ->>> dataset = load_dataset("PolyAI/minds14", "en-US", split="train") ->>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) ->>> sampling_rate = dataset.features["audio"].sampling_rate ->>> audio_file = dataset[0]["audio"]["path"] -``` - -La manera más simple de probar tu modelo para hacer inferencia es usarlo en un [`pipeline`]. Puedes instanciar un `pipeline` para reconocimiento automático del habla con tu modelo y pasarle tu archivo de audio: - -```py ->>> from transformers import pipeline - ->>> transcriber = pipeline("automatic-speech-recognition", model="stevhliu/my_awesome_asr_minds_model") ->>> transcriber(audio_file) -{'text': 'I WOUD LIKE O SET UP JOINT ACOUNT WTH Y PARTNER'} -``` - - - -La transcripción es decente, pero podría ser mejor. ¡Intenta hacerle fine-tuning a tu modelo con más ejemplos para obtener resultados aún mejores! - - - -También puedes replicar de forma manual los resultados del `pipeline` si lo deseas: - - - -Carga un procesador para preprocesar el archivo de audio y la transcripción y devuelve el `input` como un tensor de PyTorch: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("stevhliu/my_awesome_asr_mind_model") ->>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") -``` - -Pásale tus entradas al modelo y devuelve los logits: - -```py ->>> from transformers import AutoModelForCTC - ->>> model = AutoModelForCTC.from_pretrained("stevhliu/my_awesome_asr_mind_model") ->>> with torch.no_grad(): -... logits = model(**inputs).logits -``` - -Obtén los identificadores de los tokens con mayor probabilidad en las predicciones y usa el procesador para decodificarlos y transformarlos en texto: - -```py ->>> import torch - ->>> predicted_ids = torch.argmax(logits, dim=-1) ->>> transcription = processor.batch_decode(predicted_ids) ->>> transcription -['I WOUL LIKE O SET UP JOINT ACOUNT WTH Y PARTNER'] -``` - - diff --git a/docs/source/es/tasks/image_classification.md b/docs/source/es/tasks/image_classification.md new file mode 100644 index 000000000000..3a959aa934ff --- /dev/null +++ b/docs/source/es/tasks/image_classification.md @@ -0,0 +1,173 @@ + + +# Clasificación de imágenes + + + +La clasificación de imágenes asigna una etiqueta o clase a una imagen. A diferencia de la clasificación de texto o audio, las entradas son los valores de los píxeles que representan una imagen. 
La clasificación de imágenes tiene muchos usos, como la detección de daños tras una catástrofe, el control de la salud de los cultivos o la búsqueda de signos de enfermedad en imágenes médicas. + +Esta guía te mostrará cómo hacer fine-tune al [ViT](https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/vit) en el dataset [Food-101](https://huggingface.co/datasets/food101) para clasificar un alimento en una imagen. + + + +Consulta la [página de la tarea](https://huggingface.co/tasks/image-classification) de clasificación de imágenes para obtener más información sobre sus modelos, datasets y métricas asociadas. + + + +## Carga el dataset Food-101 + +Carga solo las primeras 5000 imágenes del dataset Food-101 de la biblioteca 🤗 Datasets ya que es bastante grande: + +```py +>>> from datasets import load_dataset + +>>> food = load_dataset("food101", split="train[:5000]") +``` + +Divide el dataset en un train y un test set: + +```py +>>> food = food.train_test_split(test_size=0.2) +``` + +A continuación, observa un ejemplo: + +```py +>>> food["train"][0] +{'image': , + 'label': 79} +``` + +El campo `image` contiene una imagen PIL, y cada `label` es un número entero que representa una clase. Crea un diccionario que asigne un nombre de label a un entero y viceversa. El mapeo ayudará al modelo a recuperar el nombre de label a partir del número de la misma: + +```py +>>> labels = food["train"].features["label"].names +>>> label2id, id2label = dict(), dict() +>>> for i, label in enumerate(labels): +... label2id[label] = str(i) +... id2label[str(i)] = label +``` + +Ahora puedes convertir el número de label en un nombre de label para obtener más información: + +```py +>>> id2label[str(79)] +'prime_rib' +``` + +Cada clase de alimento - o label - corresponde a un número; `79` indica una costilla de primera en el ejemplo anterior. + +## Preprocesa + +Carga el image processor de ViT para procesar la imagen en un tensor: + +```py +>>> from transformers import AutoImageProcessor + +>>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") +``` + +Aplica varias transformaciones de imagen al dataset para hacer el modelo más robusto contra el overfitting. En este caso se utilizará el módulo [`transforms`](https://pytorch.org/vision/stable/transforms.html) de torchvision. Recorta una parte aleatoria de la imagen, cambia su tamaño y normalízala con la media y la desviación estándar de la imagen: + +```py +>>> from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor + +>>> normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std) +>>> _transforms = Compose([RandomResizedCrop(image_processor.size["height"]), ToTensor(), normalize]) +``` + +Crea una función de preprocesamiento que aplique las transformaciones y devuelva los `pixel_values` - los inputs al modelo - de la imagen: + +```py +>>> def transforms(examples): +... examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]] +... del examples["image"] +... return examples +``` + +Utiliza el método [`with_transform`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?#datasets.Dataset.with_transform) de 🤗 Dataset para aplicar las transformaciones sobre todo el dataset. Las transformaciones se aplican sobre la marcha cuando se carga un elemento del dataset: + +```py +>>> food = food.with_transform(transforms) +``` + +Utiliza [`DefaultDataCollator`] para crear un batch de ejemplos.
A diferencia de otros data collators en 🤗 Transformers, el DefaultDataCollator no aplica un preprocesamiento adicional como el padding. + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator() +``` + +## Entrena +Carga ViT con [`AutoModelForImageClassification`]. Especifica el número de labels, y pasa al modelo el mapping entre el número de label y la clase de label: + +```py +>>> from transformers import AutoModelForImageClassification, TrainingArguments, Trainer + +>>> model = AutoModelForImageClassification.from_pretrained( +... "google/vit-base-patch16-224-in21k", +... num_labels=len(labels), +... id2label=id2label, +... label2id=label2id, +... ) +``` + + + +Si no estás familiarizado con el fine-tuning de un modelo con el [`Trainer`], echa un vistazo al tutorial básico [aquí](../training#finetune-with-trainer)! + + + +Al llegar a este punto, solo quedan tres pasos: + +1. Define tus hiperparámetros de entrenamiento en [`TrainingArguments`]. Es importante que no elimines las columnas que no se utilicen, ya que esto hará que desaparezca la columna `image`. Sin la columna `image` no puedes crear `pixel_values`. Establece `remove_unused_columns=False` para evitar este comportamiento. +2. Pasa los training arguments al [`Trainer`] junto con el modelo, los datasets, tokenizer y data collator. +3. Llama [`~Trainer.train`] para hacer fine-tune de tu modelo. + +```py +>>> training_args = TrainingArguments( +... output_dir="./results", +... per_device_train_batch_size=16, +... evaluation_strategy="steps", +... num_train_epochs=4, +... fp16=True, +... save_steps=100, +... eval_steps=100, +... logging_steps=10, +... learning_rate=2e-4, +... save_total_limit=2, +... remove_unused_columns=False, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... data_collator=data_collator, +... train_dataset=food["train"], +... eval_dataset=food["test"], +... tokenizer=image_processor, +... ) + +>>> trainer.train() +``` + + + +Para ver un ejemplo más a profundidad de cómo hacer fine-tune a un modelo para clasificación de imágenes, echa un vistazo al correspondiente [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). + + diff --git a/docs/source/es/tasks/image_classification.mdx b/docs/source/es/tasks/image_classification.mdx deleted file mode 100644 index 9b8b03207d08..000000000000 --- a/docs/source/es/tasks/image_classification.mdx +++ /dev/null @@ -1,169 +0,0 @@ - - -# Clasificación de imágenes - - - -La clasificación de imágenes asigna una etiqueta o clase a una imagen. A diferencia de la clasificación de texto o audio, las entradas son los valores de los píxeles que representan una imagen. La clasificación de imágenes tiene muchos usos, como la detección de daños tras una catástrofe, el control de la salud de los cultivos o la búsqueda de signos de enfermedad en imágenes médicas. - -Esta guía te mostrará como hacer fine-tune al [ViT](https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/vit) en el dataset [Food-101](https://huggingface.co/datasets/food101) para clasificar un alimento en una imagen. - - - -Consulta la [página de la tarea](https://huggingface.co/tasks/audio-classification) de clasificación de imágenes para obtener más información sobre sus modelos, datasets y métricas asociadas. 
- - - -## Carga el dataset Food-101 - -Carga solo las primeras 5000 imágenes del dataset Food-101 de la biblioteca 🤗 de Datasets ya que es bastante grande: - -```py ->>> from datasets import load_dataset - ->>> food = load_dataset("food101", split="train[:5000]") -``` - -Divide el dataset en un train y un test set: - -```py ->>> food = food.train_test_split(test_size=0.2) -``` - -A continuación, observa un ejemplo: - -```py ->>> food["train"][0] -{'image': , - 'label': 79} -``` - -El campo `image` contiene una imagen PIL, y cada `label` es un número entero que representa una clase. Crea un diccionario que asigne un nombre de label a un entero y viceversa. El mapeo ayudará al modelo a recuperar el nombre de label a partir del número de la misma: - -```py ->>> labels = food["train"].features["label"].names ->>> label2id, id2label = dict(), dict() ->>> for i, label in enumerate(labels): -... label2id[label] = str(i) -... id2label[str(i)] = label -``` - -Ahora puedes convertir el número de label en un nombre de label para obtener más información: - -```py ->>> id2label[str(79)] -'prime_rib' -``` - -Cada clase de alimento - o label - corresponde a un número; `79` indica una costilla de primera en el ejemplo anterior. - -## Preprocesa - -Carga el feature extractor de ViT para procesar la imagen en un tensor: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k") -``` - -Aplica varias transformaciones de imagen al dataset para hacer el modelo más robusto contra el overfitting. En este caso se utilizará el módulo [`transforms`](https://pytorch.org/vision/stable/transforms.html) de torchvision. Recorta una parte aleatoria de la imagen, cambia su tamaño y normalízala con la media y la desviación estándar de la imagen: - -```py ->>> from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor - ->>> normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std) ->>> _transforms = Compose([RandomResizedCrop(feature_extractor.size), ToTensor(), normalize]) -``` - -Crea una función de preprocesamiento que aplique las transformaciones y devuelva los `pixel_values` - los inputs al modelo - de la imagen: - -```py ->>> def transforms(examples): -... examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]] -... del examples["image"] -... return examples -``` - -Utiliza el método [`with_transform`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?#datasets.Dataset.with_transform) de 🤗 Dataset para aplicar las transformaciones sobre todo el dataset. Las transformaciones se aplican sobre la marcha cuando se carga un elemento del dataset: - -```py ->>> food = food.with_transform(transforms) -``` - -Utiliza [`DefaultDataCollator`] para crear un batch de ejemplos. A diferencia de otros data collators en 🤗 Transformers, el DefaultDataCollator no aplica un preprocesamiento adicional como el padding. - -```py ->>> from transformers import DefaultDataCollator - ->>> data_collator = DefaultDataCollator() -``` - -## Entrena -Carga ViT con [`AutoModelForImageClassification`]. Especifica el número de labels, y pasa al modelo el mapping entre el número de label y la clase de label: - -```py ->>> from transformers import AutoModelForImageClassification, TrainingArguments, Trainer - ->>> model = AutoModelForImageClassification.from_pretrained( -... "google/vit-base-patch16-224-in21k", -... 
num_labels=len(labels), -... id2label=id2label, -... label2id=label2id, -... ) -``` - - - -Si no estás familiarizado con el fine-tuning de un modelo con el [`Trainer`], echa un vistazo al tutorial básico [aquí](../training#finetune-with-trainer)! - - - -Al llegar a este punto, solo quedan tres pasos: - -1. Define tus hiperparámetros de entrenamiento en [`TrainingArguments`]. Es importante que no elimines las columnas que no se utilicen, ya que esto hará que desaparezca la columna `image`. Sin la columna `image` no puedes crear `pixel_values`. Establece `remove_unused_columns=False` para evitar este comportamiento. -2. Pasa los training arguments al [`Trainer`] junto con el modelo, los datasets, tokenizer y data collator. -3. Llama [`~Trainer.train`] para hacer fine-tune de tu modelo. - -```py ->>> training_args = TrainingArguments( -... output_dir="./results", -... per_device_train_batch_size=16, -... evaluation_strategy="steps", -... num_train_epochs=4, -... fp16=True, -... save_steps=100, -... eval_steps=100, -... logging_steps=10, -... learning_rate=2e-4, -... save_total_limit=2, -... remove_unused_columns=False, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... data_collator=data_collator, -... train_dataset=food["train"], -... eval_dataset=food["test"], -... tokenizer=feature_extractor, -... ) - ->>> trainer.train() -``` - - - -Para ver un ejemplo más a profundidad de cómo hacer fine-tune a un modelo para clasificación de imágenes, echa un vistazo al correspondiente [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). - - diff --git a/docs/source/es/tasks/language_modeling.md b/docs/source/es/tasks/language_modeling.md new file mode 100644 index 000000000000..66ac8fb0d4b5 --- /dev/null +++ b/docs/source/es/tasks/language_modeling.md @@ -0,0 +1,423 @@ + + +# Modelado de lenguaje + +El modelado de lenguaje predice palabras en un enunciado. Hay dos formas de modelado de lenguaje. + + + +El modelado de lenguaje causal predice el siguiente token en una secuencia de tokens, y el modelo solo puede considerar los tokens a la izquierda. + + + +El modelado de lenguaje por enmascaramiento predice un token enmascarado en una secuencia, y el modelo puede considerar los tokens bidireccionalmente. + +Esta guía te mostrará cómo realizar fine-tuning [DistilGPT2](https://huggingface.co/distilgpt2) para modelos de lenguaje causales y [DistilRoBERTa](https://huggingface.co/distilroberta-base) para modelos de lenguaje por enmascaramiento en el [r/askscience](https://www.reddit.com/r/askscience/) subdataset [ELI5](https://huggingface.co/datasets/eli5). + + + +Puedes realizar fine-tuning a otras arquitecturas para modelos de lenguaje como [GPT-Neo](https://huggingface.co/EleutherAI/gpt-neo-125M), [GPT-J](https://huggingface.co/EleutherAI/gpt-j-6B) y [BERT](https://huggingface.co/bert-base-uncased) siguiendo los mismos pasos presentados en esta guía! + +Mira la [página de tarea](https://huggingface.co/tasks/text-generation) para generación de texto y la [página de tarea](https://huggingface.co/tasks/fill-mask) para modelos de lenguajes por enmascaramiento para obtener más información sobre los modelos, datasets, y métricas asociadas. 
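Como referencia rápida, el siguiente esbozo mínimo muestra cómo se ven ambas variantes en inferencia con [`pipeline`], usando los mismos checkpoints preentrenados que se ajustan más adelante en esta guía (la frase de entrada y el número de tokens generados son solo ejemplos ilustrativos, no forman parte del dataset de la guía):

```py
>>> from transformers import pipeline

>>> # Modelado causal: el modelo continúa el texto usando solo el contexto a la izquierda
>>> generator = pipeline("text-generation", model="distilgpt2")
>>> generator("Somatic hypermutation allows the immune system to", max_new_tokens=20)

>>> # Modelado por enmascaramiento: el modelo predice el token <mask> usando el contexto en ambas direcciones
>>> fill_masker = pipeline("fill-mask", model="distilroberta-base")
>>> fill_masker("The Milky Way is a <mask> galaxy.")
```

Después del fine-tuning de las siguientes secciones puedes reutilizar estos mismos pipelines apuntando al directorio donde guardes tu modelo.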
+ + + +## Carga el dataset ELI5 + +Carga solo los primeros 5000 registros desde la biblioteca 🤗 Datasets, dado que es bastante grande: + +```py +>>> from datasets import load_dataset + +>>> eli5 = load_dataset("eli5", split="train_asks[:5000]") +``` + +Divide este dataset en subdatasets para el entrenamiento y el test: + +```py +eli5 = eli5.train_test_split(test_size=0.2) +``` + +Luego observa un ejemplo: + +```py +>>> eli5["train"][0] +{'answers': {'a_id': ['c3d1aib', 'c3d4lya'], + 'score': [6, 3], + 'text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. I don't know the altitude of that specific mission, but they're usually around 300 km. That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.", + "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"]}, + 'answers_urls': {'url': []}, + 'document': '', + 'q_id': 'nyxfp', + 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?', + 'selftext_urls': {'url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg']}, + 'subreddit': 'askscience', + 'title': 'Few questions about this space walk photograph.', + 'title_urls': {'url': []}} +``` + +Observa que `text` es un subcampo anidado dentro del diccionario `answers`. Cuando preproceses el dataset, deberás extraer el subcampo `text` en una columna aparte. + +## Preprocesamiento + + + +Para modelados de lenguaje causales carga el tokenizador DistilGPT2 para procesar el subcampo `text`: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") +``` + + + +Para modelados de lenguaje por enmascaramiento carga el tokenizador DistilRoBERTa, en lugar de DistilGPT2: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilroberta-base") +``` + +Extrae el subcampo `text` desde su estructura anidado con el método [`flatten`](https://huggingface.co/docs/datasets/process.html#flatten): + +```py +>>> eli5 = eli5.flatten() +>>> eli5["train"][0] +{'answers.a_id': ['c3d1aib', 'c3d4lya'], + 'answers.score': [6, 3], + 'answers.text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. I don't know the altitude of that specific mission, but they're usually around 300 km. That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. 
If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.", + "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"], + 'answers_urls.url': [], + 'document': '', + 'q_id': 'nyxfp', + 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?', + 'selftext_urls.url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg'], + 'subreddit': 'askscience', + 'title': 'Few questions about this space walk photograph.', + 'title_urls.url': []} +``` + +Cada subcampo es ahora una columna separada, como lo indica el prefijo `answers`. Observa que `answers.text` es una lista. En lugar de tokenizar cada enunciado por separado, convierte la lista en un string para tokenizarlos conjuntamente. + +Así es como puedes crear una función de preprocesamiento para convertir la lista en una cadena y truncar las secuencias para que no superen la longitud máxima de input de DistilGPT2: + +```py +>>> def preprocess_function(examples): +... return tokenizer([" ".join(x) for x in examples["answers.text"]], truncation=True) +``` + +Usa de 🤗 Datasets la función [`map`](https://huggingface.co/docs/datasets/process#map) para aplicar la función de preprocesamiento sobre el dataset en su totalidad. Puedes acelerar la función `map` configurando el argumento `batched=True` para procesar múltiples elementos del dataset a la vez y aumentar la cantidad de procesos con `num_proc`. Elimina las columnas que no necesitas: + +```py +>>> tokenized_eli5 = eli5.map( +... preprocess_function, +... batched=True, +... num_proc=4, +... remove_columns=eli5["train"].column_names, +... ) +``` + +Ahora necesitas una segunda función de preprocesamiento para capturar el texto truncado de cualquier ejemplo demasiado largo para evitar cualquier pérdida de información. Esta función de preprocesamiento debería: + +- Concatenar todo el texto. +- Dividir el texto concatenado en trozos más pequeños definidos por un `block_size`. + +```py +>>> block_size = 128 + + +>>> def group_texts(examples): +... concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} +... total_length = len(concatenated_examples[list(examples.keys())[0]]) +... total_length = (total_length // block_size) * block_size +... result = { +... k: [t[i : i + block_size] for i in range(0, total_length, block_size)] +... for k, t in concatenated_examples.items() +... } +... result["labels"] = result["input_ids"].copy() +... return result +``` + +Aplica la función `group_texts` sobre todo el dataset: + +```py +>>> lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4) +``` + +Para modelados de lenguaje causales, usa [`DataCollatorForLanguageModeling`] para crear un lote de ejemplos. Esto también *rellenará dinámicamente* tu texto a la dimensión del elemento más largo del lote para que de esta manera tengan largo uniforme. Si bien es posible rellenar tu texto en la función `tokenizer` mediante el argumento `padding=True`, el rellenado dinámico es más eficiente. + + + +Puedes usar el token de final de secuencia como el token de relleno y asignar `mlm=False`. 
Esto usará los inputs como etiquetas movidas un elemento hacia la derecha: + +```py +>>> from transformers import DataCollatorForLanguageModeling + +>>> tokenizer.pad_token = tokenizer.eos_token +>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) +``` + +Para modelados de lenguaje por enmascaramiento usa el mismo [`DataCollatorForLanguageModeling`] excepto que deberás especificar `mlm_probability` para enmascarar tokens aleatoriamente cada vez que iteras sobre los datos. + +```py +>>> from transformers import DataCollatorForLanguageModeling + +>>> tokenizer.pad_token = tokenizer.eos_token +>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15) +``` + + +Puedes usar el token de final de secuencia como el token de relleno y asignar `mlm=False`. Esto usará los inputs como etiquetas movidas un elemento hacia la derecha: + +```py +>>> from transformers import DataCollatorForLanguageModeling + +>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="tf") +``` + +Para modelados de lenguajes por enmascaramiento usa el mismo [`DataCollatorForLanguageModeling`] excepto que deberás especificar `mlm_probability` para enmascarar tokens aleatoriamente cada vez que iteras sobre los datos. + +```py +>>> from transformers import DataCollatorForLanguageModeling + +>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="tf") +``` + + + +## Modelado de lenguaje causal + +El modelado de lenguaje causal es frecuentemente utilizado para generación de texto. Esta sección te muestra cómo realizar fine-tuning a [DistilGPT2](https://huggingface.co/distilgpt2) para generar nuevo texto. + +### Entrenamiento + + + +Carga DistilGPT2 con [`AutoModelForCausalLM`]: + +```py +>>> from transformers import AutoModelForCausalLM, TrainingArguments, Trainer + +>>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") +``` + + + +Si no estás familiarizado con el proceso de realizar fine-tuning sobre un modelo con [`Trainer`], considera el tutorial básico [aquí](../training#finetune-with-trainer)! + + + +A este punto, solo faltan tres pasos: + +1. Definir tus hiperparámetros de entrenamiento en [`TrainingArguments`]. +2. Pasarle los argumentos de entrenamiento a [`Trainer`] junto con el modelo, dataset, y el data collator. +3. Realiza la llamada [`~Trainer.train`] para realizar el fine-tuning sobre tu modelo. + +```py +>>> training_args = TrainingArguments( +... output_dir="./results", +... evaluation_strategy="epoch", +... learning_rate=2e-5, +... weight_decay=0.01, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=lm_dataset["train"], +... eval_dataset=lm_dataset["test"], +... data_collator=data_collator, +... ) + +>>> trainer.train() +``` + + +Para realizar el fine-tuning de un modelo en TensorFlow, comienza por convertir tus datasets al formato `tf.data.Dataset` con [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Especifica los inputs y las etiquetas en `columns`, si deseas mezclar el dataset, el tamaño de lote y el data collator: + +```py +>>> tf_train_set = lm_dataset["train"].to_tf_dataset( +... columns=["attention_mask", "input_ids", "labels"], +... dummy_labels=True, +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_test_set = lm_dataset["test"].to_tf_dataset( +...
columns=["attention_mask", "input_ids", "labels"], +... dummy_labels=True, +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + + + +Si no estás familiarizado con realizar fine-tuning de tus modelos con Keras, considera el tutorial básico [aquí](training#finetune-with-keras)! + + + +Crea la función optimizadora, la tasa de aprendizaje, y algunos hiperparámetros de entrenamiento: + +```py +>>> from transformers import create_optimizer, AdamWeightDecay + +>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) +``` + +Carga DistilGPT2 con [`TFAutoModelForCausalLM`]: + +```py +>>> from transformers import TFAutoModelForCausalLM + +>>> model = TFAutoModelForCausalLM.from_pretrained("distilgpt2") +``` + +Configura el modelo para entrenamiento con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) +``` + +Llama a [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) para realizar el fine-tuning del modelo: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3) +``` + + + +## Modelado de lenguaje por enmascaramiento + +El modelado de lenguaje por enmascaramiento es también conocido como una tarea de rellenar la máscara, pues predice un token enmascarado dada una secuencia. Los modelos de lenguaje por enmascaramiento requieren una buena comprensión del contexto de una secuencia entera, en lugar de solo el contexto a la izquierda. Esta sección te enseña como realizar el fine-tuning de [DistilRoBERTa](https://huggingface.co/distilroberta-base) para predecir una palabra enmascarada. + +### Entrenamiento + + + +Carga DistilRoBERTa con [`AutoModelForMaskedlM`]: + +```py +>>> from transformers import AutoModelForMaskedLM + +>>> model = AutoModelForMaskedLM.from_pretrained("distilroberta-base") +``` + + + +Si no estás familiarizado con el proceso de realizar fine-tuning sobre un modelo con [`Trainer`], considera el tutorial básico [aquí](../training#finetune-with-trainer)! + + + +A este punto, solo faltan tres pasos: + +1. Definir tus hiperparámetros de entrenamiento en [`TrainingArguments`]. +2. Pasarle los argumentos de entrenamiento a [`Trainer`] junto con el modelo, dataset, y el data collator. +3. Realiza la llamada [`~Trainer.train`] para realizar el fine-tuning de tu modelo. + +```py +>>> training_args = TrainingArguments( +... output_dir="./results", +... evaluation_strategy="epoch", +... learning_rate=2e-5, +... num_train_epochs=3, +... weight_decay=0.01, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=lm_dataset["train"], +... eval_dataset=lm_dataset["test"], +... data_collator=data_collator, +... ) + +>>> trainer.train() +``` + + +Para realizar el fine-tuning de un modelo en TensorFlow, comienza por convertir tus datasets al formato `tf.data.Dataset` con [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Especifica los inputs y etiquetas en `columns`, ya sea para mezclar el dataset, tamaño de lote, y el data collator: + +```py +>>> tf_train_set = lm_dataset["train"].to_tf_dataset( +... columns=["attention_mask", "input_ids", "labels"], +... dummy_labels=True, +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_test_set = lm_dataset["test"].to_tf_dataset( +... columns=["attention_mask", "input_ids", "labels"], +... dummy_labels=True, +... 
shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + + + +Si no estás familiarizado con realizar fine-tuning de tus modelos con Keras, considera el tutorial básico [aquí](training#finetune-with-keras)! + + + +Crea la función optimizadora, la tasa de aprendizaje, y algunos hiperparámetros de entrenamiento: + +```py +>>> from transformers import create_optimizer, AdamWeightDecay + +>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) +``` + +Carga DistilRoBERTa con [`TFAutoModelForMaskedLM`]: + +```py +>>> from transformers import TFAutoModelForMaskedLM + +>>> model = TFAutoModelForMaskedLM.from_pretrained("distilroberta-base") +``` + +Configura el modelo para entrenamiento con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) +``` + +Llama a [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) para realizar el fine-tuning del modelo: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3) +``` + + + + + +Para un ejemplo más profundo sobre cómo realizar el fine-tuning sobre un modelo de lenguaje causal, considera +[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb) +o [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb). + + \ No newline at end of file diff --git a/docs/source/es/tasks/language_modeling.mdx b/docs/source/es/tasks/language_modeling.mdx deleted file mode 100644 index 565185072a11..000000000000 --- a/docs/source/es/tasks/language_modeling.mdx +++ /dev/null @@ -1,419 +0,0 @@ - - -# Modelado de lenguaje - -El modelado de lenguaje predice palabras en un enunciado. Hay dos formas de modelado de lenguaje. - - - -El modelado de lenguaje causal predice el siguiente token en una secuencia de tokens, y el modelo solo puede considerar los tokens a la izquierda. - - - -El modelado de lenguaje por enmascaramiento predice un token enmascarado en una secuencia, y el modelo puede considerar los tokens bidireccionalmente. - -Esta guía te mostrará cómo realizar fine-tuning [DistilGPT2](https://huggingface.co/distilgpt2) para modelos de lenguaje causales y [DistilRoBERTa](https://huggingface.co/distilroberta-base) para modelos de lenguaje por enmascaramiento en el [r/askscience](https://www.reddit.com/r/askscience/) subdataset [ELI5](https://huggingface.co/datasets/eli5). - - - -Puedes realizar fine-tuning a otras arquitecturas para modelos de lenguaje como [GPT-Neo](https://huggingface.co/EleutherAI/gpt-neo-125M), [GPT-J](https://huggingface.co/EleutherAI/gpt-j-6B) y [BERT](https://huggingface.co/bert-base-uncased) siguiendo los mismos pasos presentados en esta guía! - -Mira la [página de tarea](https://huggingface.co/tasks/text-generation) para generación de texto y la [página de tarea](https://huggingface.co/tasks/fill-mask) para modelos de lenguajes por enmascaramiento para obtener más información sobre los modelos, datasets, y métricas asociadas.
- - - -## Carga el dataset ELI5 - -Carga solo los primeros 5000 registros desde la biblioteca 🤗 Datasets, dado que es bastante grande: - -```py ->>> from datasets import load_dataset - ->>> eli5 = load_dataset("eli5", split="train_asks[:5000]") -``` - -Divide este dataset en subdatasets para el entrenamiento y el test: - -```py -eli5 = eli5.train_test_split(test_size=0.2) -``` - -Luego observa un ejemplo: - -```py ->>> eli5["train"][0] -{'answers': {'a_id': ['c3d1aib', 'c3d4lya'], - 'score': [6, 3], - 'text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. I don't know the altitude of that specific mission, but they're usually around 300 km. That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.", - "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"]}, - 'answers_urls': {'url': []}, - 'document': '', - 'q_id': 'nyxfp', - 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?', - 'selftext_urls': {'url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg']}, - 'subreddit': 'askscience', - 'title': 'Few questions about this space walk photograph.', - 'title_urls': {'url': []}} -``` - -Observa que `text` es un subcampo anidado dentro del diccionario `answers`. Cuando preproceses el dataset, deberás extraer el subcampo `text` en una columna aparte. - -## Preprocesamiento - - - -Para modelados de lenguaje causales carga el tokenizador DistilGPT2 para procesar el subcampo `text`: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") -``` - - - -Para modelados de lenguaje por enmascaramiento carga el tokenizador DistilRoBERTa, en lugar de DistilGPT2: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("distilroberta-base") -``` - -Extrae el subcampo `text` desde su estructura anidado con el método [`flatten`](https://huggingface.co/docs/datasets/process.html#flatten): - -```py ->>> eli5 = eli5.flatten() ->>> eli5["train"][0] -{'answers.a_id': ['c3d1aib', 'c3d4lya'], - 'answers.score': [6, 3], - 'answers.text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. I don't know the altitude of that specific mission, but they're usually around 300 km. That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. 
If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.", - "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"], - 'answers_urls.url': [], - 'document': '', - 'q_id': 'nyxfp', - 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?', - 'selftext_urls.url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg'], - 'subreddit': 'askscience', - 'title': 'Few questions about this space walk photograph.', - 'title_urls.url': []} -``` - -Cada subcampo es ahora una columna separada, como lo indica el prefijo `answers`. Observa que `answers.text` es una lista. En lugar de tokenizar cada enunciado por separado, convierte la lista en un string para tokenizarlos conjuntamente. - -Así es como puedes crear una función de preprocesamiento para convertir la lista en una cadena y truncar las secuencias para que no superen la longitud máxima de input de DistilGPT2: - -```py ->>> def preprocess_function(examples): -... return tokenizer([" ".join(x) for x in examples["answers.text"]], truncation=True) -``` - -Usa de 🤗 Datasets la función [`map`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map) para aplicar la función de preprocesamiento sobre el dataset en su totalidad. Puedes acelerar la función `map` configurando el argumento `batched=True` para procesar múltiples elementos del dataset a la vez y aumentar la cantidad de procesos con `num_proc`. Elimina las columnas que no necesitas: - -```py ->>> tokenized_eli5 = eli5.map( -... preprocess_function, -... batched=True, -... num_proc=4, -... remove_columns=eli5["train"].column_names, -... ) -``` - -Ahora necesitas una segunda función de preprocesamiento para capturar el texto truncado de cualquier ejemplo demasiado largo para evitar cualquier pérdida de información. Esta función de preprocesamiento debería: - -- Concatenar todo el texto. -- Dividir el texto concatenado en trozos más pequeños definidos por un `block_size`. - -```py ->>> block_size = 128 - - ->>> def group_texts(examples): -... concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} -... total_length = len(concatenated_examples[list(examples.keys())[0]]) -... total_length = (total_length // block_size) * block_size -... result = { -... k: [t[i : i + block_size] for i in range(0, total_length, block_size)] -... for k, t in concatenated_examples.items() -... } -... result["labels"] = result["input_ids"].copy() -... return result -``` - -Aplica la función `group_texts` sobre todo el dataset: - -```py ->>> lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4) -``` - -Para modelados de lenguaje causales, usa [`DataCollatorForLanguageModeling`] para crear un lote de ejemplos. Esto también *rellenará dinámicamente* tu texto a la dimensión del elemento más largo del lote para que de esta manera tengan largo uniforme. Si bien es posible rellenar tu texto en la función `tokenizer` mediante el argumento `padding=True`, el rellenado dinámico es más eficiente. 
- - - -Puedes usar el token de final de secuencia como el token de relleno y asignar `mlm=False`. Esto usará los inputs como etiquetas movidas un elemento hacia la derecha: - -```py ->>> from transformers import DataCollatorForLanguageModeling - ->>> tokenizer.pad_token = tokenizer.eos_token ->>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) -``` - -Para modelados de lenguaje por enmascaramiento usa el mismo [`DataCollatorForLanguageModeling`] excepto que deberás especificar `mlm_probability` para enmascarar tokens aleatoriamente cada vez que iteras sobre los datos. - -```py ->>> from transformers import DataCollatorForLanguageModeling - ->>> tokenizer.pad_token = tokenizer.eos_token ->>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15) -``` - - -Puedes usar el token de final de secuencia como el token de relleno y asignar `mlm=False`. Esto usará los inputs como etiquetas movidas un elemento hacia la derecha: - -```py ->>> from transformers import DataCollatorForLanguageModeling - ->>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="tf") -``` - -Para modelados de lenguajes por enmascaramiento usa el mismo [`DataCollatorForLanguageModeling`] excepto que deberás especificar `mlm_probability` para enmascarar tokens aleatoriamente cada vez que iteras sobre los datos. - -```py ->>> from transformers import DataCollatorForLanguageModeling - ->>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="tf") -``` - - - -## Modelado de lenguaje causal - -El modelado de lenguaje causal es frecuentemente utilizado para generación de texto. Esta sección te muestra cómo realizar fine-tuning a [DistilGPT2](https://huggingface.co/distilgpt2) para generar nuevo texto. - -### Entrenamiento - - - -Carga DistilGPT2 con [`AutoModelForCausalLM`]: - -```py ->>> from transformers import AutoModelForCausalLM, TrainingArguments, Trainer - ->>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") -``` - - - -Si no estás familiarizado con el proceso de realizar fine-tuning sobre un modelo con [`Trainer`], considera el tutorial básico [aquí](../training#finetune-with-trainer)! - - - -A este punto, solo faltan tres pasos: - -1. Definir tus hiperparámetros de entrenamiento en [`TrainingArguments`]. -2. Pasarle los argumentos de entrenamiento a [`Trainer`] junto con el modelo, dataset, y el data collator. -3. Realiza la llamada [`~Trainer.train`] para realizar el fine-tuning sobre tu modelo. - -```py ->>> training_args = TrainingArguments( -... output_dir="./results", -... evaluation_strategy="epoch", -... learning_rate=2e-5, -... weight_decay=0.01, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=lm_dataset["train"], -... eval_dataset=lm_dataset["test"], -... data_collator=data_collator, -... ) - ->>> trainer.train() -``` - - -Para realizar el fine-tuning de un modelo en TensorFlow, comienza por convertir tus datasets al formato `tf.data.Dataset` con [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Especifica los inputs y etiquetas en `columns`, ya sea para mezclar el dataset, tamaño de lote, y el data collator: - -```py ->>> tf_train_set = lm_dataset["train"].to_tf_dataset( -... columns=["attention_mask", "input_ids", "labels"], -... dummy_labels=True, -... shuffle=True, -... batch_size=16, -... collate_fn=data_collator, -... 
) - ->>> tf_test_set = lm_dataset["test"].to_tf_dataset( -... columns=["attention_mask", "input_ids", "labels"], -... dummy_labels=True, -... shuffle=False, -... batch_size=16, -... collate_fn=data_collator, -... ) -``` - - - -Si no estás familiarizado con realizar fine-tuning de tus modelos con Keras, considera el tutorial básico [aquí](training#finetune-with-keras)! - - - -Crea la función optimizadora, la tasa de aprendizaje, y algunos hiperparámetros de entrenamiento: - -```py ->>> from transformers import create_optimizer, AdamWeightDecay - ->>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) -``` - -Carga DistilGPT2 con [`TFAutoModelForCausalLM`]: - -```py ->>> from transformers import TFAutoModelForCausalLM - ->>> model = TFAutoModelForCausalLM.from_pretrained("distilgpt2") -``` - -Configura el modelo para entrenamiento con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> import tensorflow as tf - ->>> model.compile(optimizer=optimizer) -``` - -Llama a [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) para realizar el fine-tuning del modelo: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3) -``` - - - -## Modelado de lenguaje por enmascaramiento - -El modelado de lenguaje por enmascaramiento es también conocido como una tarea de rellenar la máscara, pues predice un token enmascarado dada una secuencia. Los modelos de lenguaje por enmascaramiento requieren una buena comprensión del contexto de una secuencia entera, en lugar de solo el contexto a la izquierda. Esta sección te enseña como realizar el fine-tuning de [DistilRoBERTa](https://huggingface.co/distilroberta-base) para predecir una palabra enmascarada. - -### Entrenamiento - - - -Carga DistilRoBERTa con [`AutoModelForMaskedlM`]: - -```py ->>> from transformers import AutoModelForMaskedLM - ->>> model = AutoModelForMaskedLM.from_pretrained("distilroberta-base") -``` - - - -Si no estás familiarizado con el proceso de realizar fine-tuning sobre un modelo con [`Trainer`], considera el tutorial básico [aquí](../training#finetune-with-trainer)! - - - -A este punto, solo faltan tres pasos: - -1. Definir tus hiperparámetros de entrenamiento en [`TrainingArguments`]. -2. Pasarle los argumentos de entrenamiento a [`Trainer`] junto con el modelo, dataset, y el data collator. -3. Realiza la llamada [`~Trainer.train`] para realizar el fine-tuning de tu modelo. - -```py ->>> training_args = TrainingArguments( -... output_dir="./results", -... evaluation_strategy="epoch", -... learning_rate=2e-5, -... num_train_epochs=3, -... weight_decay=0.01, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=lm_dataset["train"], -... eval_dataset=lm_dataset["test"], -... data_collator=data_collator, -... ) - ->>> trainer.train() -``` - - -Para realizar el fine-tuning de un modelo en TensorFlow, comienza por convertir tus datasets al formato `tf.data.Dataset` con [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Especifica los inputs y etiquetas en `columns`, ya sea para mezclar el dataset, tamaño de lote, y el data collator: - -```py ->>> tf_train_set = lm_dataset["train"].to_tf_dataset( -... columns=["attention_mask", "input_ids", "labels"], -... dummy_labels=True, -... shuffle=True, -... batch_size=16, -... collate_fn=data_collator, -... ) - ->>> tf_test_set = lm_dataset["test"].to_tf_dataset( -... 
columns=["attention_mask", "input_ids", "labels"], -... dummy_labels=True, -... shuffle=False, -... batch_size=16, -... collate_fn=data_collator, -... ) -``` - - - -Si no estás familiarizado con realizar fine-tuning de tus modelos con Keras, considera el tutorial básico [aquí](training#finetune-with-keras)! - - - -Crea la función optimizadora, la tasa de aprendizaje, y algunos hiperparámetros de entrenamiento: - -```py ->>> from transformers import create_optimizer, AdamWeightDecay - ->>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) -``` - -Carga DistilRoBERTa con [`TFAutoModelForMaskedLM`]: - -```py ->>> from transformers import TFAutoModelForMaskedLM - ->>> model = TFAutoModelForCausalLM.from_pretrained("distilroberta-base") -``` - -Configura el modelo para entrenamiento con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> import tensorflow as tf - ->>> model.compile(optimizer=optimizer) -``` - -Llama a [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) para realizar el fine-tuning del modelo: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3) -``` - - - - - -Para un ejemplo más profundo sobre cómo realizar el fine-tuning sobre un modelo de lenguaje causal, considera -[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb) -o [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb). - - \ No newline at end of file diff --git a/docs/source/es/tasks/multiple_choice.md b/docs/source/es/tasks/multiple_choice.md new file mode 100644 index 000000000000..8391dcbdd5eb --- /dev/null +++ b/docs/source/es/tasks/multiple_choice.md @@ -0,0 +1,292 @@ + + +# Selección múltiple + +La tarea de selección múltiple es parecida a la de responder preguntas, con la excepción de que se dan varias opciones de respuesta junto con el contexto. El modelo se entrena para escoger la respuesta correcta +entre varias opciones a partir del contexto dado. + +Esta guía te mostrará como hacerle fine-tuning a [BERT](https://huggingface.co/bert-base-uncased) en la configuración `regular` del dataset [SWAG](https://huggingface.co/datasets/swag), de forma +que seleccione la mejor respuesta a partir de varias opciones y algún contexto. + +## Cargar el dataset SWAG + +Carga el dataset SWAG con la biblioteca 🤗 Datasets: + +```py +>>> from datasets import load_dataset + +>>> swag = load_dataset("swag", "regular") +``` + +Ahora, échale un vistazo a un ejemplo del dataset: + +```py +>>> swag["train"][0] +{'ending0': 'passes by walking down the street playing their instruments.', + 'ending1': 'has heard approaching them.', + 'ending2': "arrives and they're outside dancing and asleep.", + 'ending3': 'turns the lead singer watches the performance.', + 'fold-ind': '3416', + 'gold-source': 'gold', + 'label': 0, + 'sent1': 'Members of the procession walk down the street holding small horn brass instruments.', + 'sent2': 'A drum line', + 'startphrase': 'Members of the procession walk down the street holding small horn brass instruments. A drum line', + 'video-id': 'anetv_jkn6uvmqwh4'} +``` + +Los campos `sent1` y `sent2` muestran cómo comienza una oración, y cada campo `ending` indica cómo podría terminar. Dado el comienzo de la oración, el modelo debe escoger el final de oración correcto indicado por el campo `label`. 
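Para ver con más claridad qué debe escoger el modelo, este pequeño esbozo (usa únicamente los campos del ejemplo mostrado arriba) reconstruye las cuatro oraciones candidatas del primer ejemplo y marca la correcta según `label`:

```py
>>> example = swag["train"][0]

>>> # El comienzo compartido por las cuatro opciones
>>> start = example["sent1"] + " " + example["sent2"]

>>> # Cada opción completa ese comienzo con uno de los cuatro finales
>>> for i in range(4):
...     ending = example["ending" + str(i)]
...     marca = " <- correcta" if i == example["label"] else ""
...     print(f"{i}: {start} {ending}{marca}")
```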
+ +## Preprocesamiento + +Carga el tokenizer de BERT para procesar el comienzo de cada oración y los cuatro finales posibles: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") +``` + +La función de preprocesamiento debe hacer lo siguiente: + +1. Hacer cuatro copias del campo `sent1` de forma que se pueda combinar cada una con el campo `sent2` para recrear la forma en que empieza la oración. +2. Combinar `sent2` con cada uno de los cuatro finales de oración posibles. +3. Aplanar las dos listas para que puedas tokenizarlas, y luego des-aplanarlas para que cada ejemplo tenga los campos `input_ids`, `attention_mask` y `labels` correspondientes. + +```py +>>> ending_names = ["ending0", "ending1", "ending2", "ending3"] + + +>>> def preprocess_function(examples): +... first_sentences = [[context] * 4 for context in examples["sent1"]] +... question_headers = examples["sent2"] +... second_sentences = [ +... [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers) +... ] + +... first_sentences = sum(first_sentences, []) +... second_sentences = sum(second_sentences, []) + +... tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True) +... return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()} +``` + +Usa la función [`~datasets.Dataset.map`] de 🤗 Datasets para aplicarle la función de preprocesamiento al dataset entero. Puedes acelerar la función `map` haciendo `batched=True` para procesar varios elementos del dataset a la vez. + +```py +tokenized_swag = swag.map(preprocess_function, batched=True) +``` + +🤗 Transformers no tiene un collator de datos para la tarea de selección múltiple, así que tendrías que crear uno. Puedes adaptar el [`DataCollatorWithPadding`] para crear un lote de ejemplos para selección múltiple. Este también +le *añadirá relleno de manera dinámica* a tu texto y a las etiquetas para que tengan la longitud del elemento más largo en su lote, de forma que tengan una longitud uniforme. Aunque es posible rellenar el texto en la función `tokenizer` haciendo +`padding=True`, el rellenado dinámico es más eficiente. + +El `DataCollatorForMultipleChoice` aplanará todas las entradas del modelo, les aplicará relleno y luego des-aplanará los resultados: + + + +```py +>>> from dataclasses import dataclass +>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy +>>> from typing import Optional, Union +>>> import torch + + +>>> @dataclass +... class DataCollatorForMultipleChoice: +... """ +... Collator de datos que le añadirá relleno de forma automática a las entradas recibidas para +... una tarea de selección múltiple. +... """ + +... tokenizer: PreTrainedTokenizerBase +... padding: Union[bool, str, PaddingStrategy] = True +... max_length: Optional[int] = None +... pad_to_multiple_of: Optional[int] = None + +... def __call__(self, features): +... label_name = "label" if "label" in features[0].keys() else "labels" +... labels = [feature.pop(label_name) for feature in features] +... batch_size = len(features) +... num_choices = len(features[0]["input_ids"]) +... flattened_features = [ +... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features +... ] +... flattened_features = sum(flattened_features, []) + +... batch = self.tokenizer.pad( +... flattened_features, +... padding=self.padding, +... max_length=self.max_length, +...
pad_to_multiple_of=self.pad_to_multiple_of, +... return_tensors="pt", +... ) + +... batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()} +... batch["labels"] = torch.tensor(labels, dtype=torch.int64) +... return batch +``` + + +```py +>>> from dataclasses import dataclass +>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy +>>> from typing import Optional, Union +>>> import tensorflow as tf + + +>>> @dataclass +... class DataCollatorForMultipleChoice: +... """ +... Data collator that will dynamically pad the inputs for multiple choice received. +... """ + +... tokenizer: PreTrainedTokenizerBase +... padding: Union[bool, str, PaddingStrategy] = True +... max_length: Optional[int] = None +... pad_to_multiple_of: Optional[int] = None + +... def __call__(self, features): +... label_name = "label" if "label" in features[0].keys() else "labels" +... labels = [feature.pop(label_name) for feature in features] +... batch_size = len(features) +... num_choices = len(features[0]["input_ids"]) +... flattened_features = [ +... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features +... ] +... flattened_features = sum(flattened_features, []) + +... batch = self.tokenizer.pad( +... flattened_features, +... padding=self.padding, +... max_length=self.max_length, +... pad_to_multiple_of=self.pad_to_multiple_of, +... return_tensors="tf", +... ) + +... batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()} +... batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64) +... return batch +``` + + + +## Entrenamiento + + + +Carga el modelo BERT con [`AutoModelForMultipleChoice`]: + +```py +>>> from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer + +>>> model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased") +``` + + + +Para familiarizarte con el fine-tuning con [`Trainer`], ¡mira el tutorial básico [aquí](../training#finetune-with-trainer)! + + + +En este punto, solo quedan tres pasos: + +1. Definir tus hiperparámetros de entrenamiento en [`TrainingArguments`]. +2. Pasarle los argumentos del entrenamiento al [`Trainer`] jnto con el modelo, el dataset, el tokenizer y el collator de datos. +3. Invocar el método [`~Trainer.train`] para realizar el fine-tuning del modelo. + +```py +>>> training_args = TrainingArguments( +... output_dir="./results", +... evaluation_strategy="epoch", +... learning_rate=5e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... num_train_epochs=3, +... weight_decay=0.01, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_swag["train"], +... eval_dataset=tokenized_swag["validation"], +... tokenizer=tokenizer, +... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer), +... ) + +>>> trainer.train() +``` + + +Para realizar el fine-tuning de un modelo en TensorFlow, primero convierte tus datasets al formato `tf.data.Dataset` con el método [`~TFPreTrainedModel.prepare_tf_dataset`]. + +```py +>>> data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer) +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_swag["train"], +... shuffle=True, +... batch_size=batch_size, +... collate_fn=data_collator, +... ) + +>>> tf_validation_set = model.prepare_tf_dataset( +... tokenized_swag["validation"], +... shuffle=False, +... batch_size=batch_size, +... collate_fn=data_collator, +... 
) +``` + + + +Para familiarizarte con el fine-tuning con Keras, ¡mira el tutorial básico [aquí](training#finetune-with-keras)! + + + +Prepara una función de optimización, un programa para la tasa de aprendizaje y algunos hiperparámetros de entrenamiento: + +```py +>>> from transformers import create_optimizer + +>>> batch_size = 16 +>>> num_train_epochs = 2 +>>> total_train_steps = (len(tokenized_swag["train"]) // batch_size) * num_train_epochs +>>> optimizer, schedule = create_optimizer(init_lr=5e-5, num_warmup_steps=0, num_train_steps=total_train_steps) +``` + +Carga el modelo BERT con [`TFAutoModelForMultipleChoice`]: + +```py +>>> from transformers import TFAutoModelForMultipleChoice + +>>> model = TFAutoModelForMultipleChoice.from_pretrained("bert-base-uncased") +``` + +Configura el modelo para entrenarlo con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): + +```py +>>> model.compile(optimizer=optimizer) +``` + +Invoca el método [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) para realizar el fine-tuning del modelo: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2) +``` + + diff --git a/docs/source/es/tasks/multiple_choice.mdx b/docs/source/es/tasks/multiple_choice.mdx deleted file mode 100644 index 2ece0969bf96..000000000000 --- a/docs/source/es/tasks/multiple_choice.mdx +++ /dev/null @@ -1,288 +0,0 @@ - - -# Selección múltiple - -La tarea de selección múltiple es parecida a la de responder preguntas, con la excepción de que se dan varias opciones de respuesta junto con el contexto. El modelo se entrena para escoger la respuesta correcta -entre varias opciones a partir del contexto dado. - -Esta guía te mostrará como hacerle fine-tuning a [BERT](https://huggingface.co/bert-base-uncased) en la configuración `regular` del dataset [SWAG](https://huggingface.co/datasets/swag), de forma -que seleccione la mejor respuesta a partir de varias opciones y algún contexto. - -## Cargar el dataset SWAG - -Carga el dataset SWAG con la biblioteca 🤗 Datasets: - -```py ->>> from datasets import load_dataset - ->>> swag = load_dataset("swag", "regular") -``` - -Ahora, échale un vistazo a un ejemplo del dataset: - -```py ->>> swag["train"][0] -{'ending0': 'passes by walking down the street playing their instruments.', - 'ending1': 'has heard approaching them.', - 'ending2': "arrives and they're outside dancing and asleep.", - 'ending3': 'turns the lead singer watches the performance.', - 'fold-ind': '3416', - 'gold-source': 'gold', - 'label': 0, - 'sent1': 'Members of the procession walk down the street holding small horn brass instruments.', - 'sent2': 'A drum line', - 'startphrase': 'Members of the procession walk down the street holding small horn brass instruments. A drum line', - 'video-id': 'anetv_jkn6uvmqwh4'} -``` - -Los campos `sent1` y `sent2` muestran cómo comienza una oración, y cada campo `ending` indica cómo podría terminar. Dado el comienzo de la oración, el modelo debe escoger el final de oración correcto indicado por el campo `label`. - -## Preprocesmaiento - -Carga el tokenizer de BERT para procesar el comienzo de cada oración y los cuatro finales posibles: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") -``` - -La función de preprocesmaiento debe hacer lo siguiente: - -1. Hacer cuatro copias del campo `sent1` de forma que se pueda combinar cada una con el campo `sent2` para recrear la forma en que empieza la oración. -2. 
Combinar `sent2` con cada uno de los cuatro finales de oración posibles. -3. Aplanar las dos listas para que puedas tokenizarlas, y luego des-aplanarlas para que cada ejemplo tenga los campos `input_ids`, `attention_mask` y `labels` correspondientes. - -```py ->>> ending_names = ["ending0", "ending1", "ending2", "ending3"] - - ->>> def preprocess_function(examples): -... first_sentences = [[context] * 4 for context in examples["sent1"]] -... question_headers = examples["sent2"] -... second_sentences = [ -... [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers) -... ] - -... first_sentences = sum(first_sentences, []) -... second_sentences = sum(second_sentences, []) - -... tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True) -... return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()} -``` - -Usa la función [`~datasets.Dataset.map`] de 🤗 Datasets para aplicarle la función de preprocesamiento al dataset entero. Puedes acelerar la función `map` haciendo `batched=True` para procesar varios elementos del dataset a la vez. - -```py -tokenized_swag = swag.map(preprocess_function, batched=True) -``` - -🤗 Transformers no tiene un collator de datos para la tarea de selección múltiple, así que tendrías que crear uno. Puedes adaptar el [`DataCollatorWithPadding`] para crear un lote de ejemplos para selección múltiple. Este también -le *añadirá relleno de manera dinámica* a tu texto y a las etiquetas para que tengan la longitud del elemento más largo en su lote, de forma que tengan una longitud uniforme. Aunque es posible rellenar el texto en la función `tokenizer` haciendo -`padding=True`, el rellenado dinámico es más eficiente. - -El `DataCollatorForMultipleChoice` aplanará todas las entradas del modelo, les aplicará relleno y luego des-aplanará los resultados: - - - -```py ->>> from dataclasses import dataclass ->>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy ->>> from typing import Optional, Union ->>> import torch - - ->>> @dataclass -... class DataCollatorForMultipleChoice: -... """ -... Collator de datos que le añadirá relleno de forma automática a las entradas recibidas para -... una tarea de selección múltiple. -... """ - -... tokenizer: PreTrainedTokenizerBase -... padding: Union[bool, str, PaddingStrategy] = True -... max_length: Optional[int] = None -... pad_to_multiple_of: Optional[int] = None - -... def __call__(self, features): -... label_name = "label" if "label" in features[0].keys() else "labels" -... labels = [feature.pop(label_name) for feature in features] -... batch_size = len(features) -... num_choices = len(features[0]["input_ids"]) -... flattened_features = [ -... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features -... ] -... flattened_features = sum(flattened_features, []) - -... batch = self.tokenizer.pad( -... flattened_features, -... padding=self.padding, -... max_length=self.max_length, -... pad_to_multiple_of=self.pad_to_multiple_of, -... return_tensors="pt", -... ) - -... batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()} -... batch["labels"] = torch.tensor(labels, dtype=torch.int64) -... return batch -``` - - -```py ->>> from dataclasses import dataclass ->>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy ->>> from typing import Optional, Union ->>> import tensorflow as tf - - ->>> @dataclass -... 
class DataCollatorForMultipleChoice: -... """ -... Data collator that will dynamically pad the inputs for multiple choice received. -... """ - -... tokenizer: PreTrainedTokenizerBase -... padding: Union[bool, str, PaddingStrategy] = True -... max_length: Optional[int] = None -... pad_to_multiple_of: Optional[int] = None - -... def __call__(self, features): -... label_name = "label" if "label" in features[0].keys() else "labels" -... labels = [feature.pop(label_name) for feature in features] -... batch_size = len(features) -... num_choices = len(features[0]["input_ids"]) -... flattened_features = [ -... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features -... ] -... flattened_features = sum(flattened_features, []) - -... batch = self.tokenizer.pad( -... flattened_features, -... padding=self.padding, -... max_length=self.max_length, -... pad_to_multiple_of=self.pad_to_multiple_of, -... return_tensors="tf", -... ) - -... batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()} -... batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64) -... return batch -``` - - - -## Entrenamiento - - - -Carga el modelo BERT con [`AutoModelForMultipleChoice`]: - -```py ->>> from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer - ->>> model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased") -``` - - - -Para familiarizarte con el fine-tuning con [`Trainer`], ¡mira el tutorial básico [aquí](../training#finetune-with-trainer)! - - - -En este punto, solo quedan tres pasos: - -1. Definir tus hiperparámetros de entrenamiento en [`TrainingArguments`]. -2. Pasarle los argumentos del entrenamiento al [`Trainer`] jnto con el modelo, el dataset, el tokenizer y el collator de datos. -3. Invocar el método [`~Trainer.train`] para realizar el fine-tuning del modelo. - -```py ->>> training_args = TrainingArguments( -... output_dir="./results", -... evaluation_strategy="epoch", -... learning_rate=5e-5, -... per_device_train_batch_size=16, -... per_device_eval_batch_size=16, -... num_train_epochs=3, -... weight_decay=0.01, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=tokenized_swag["train"], -... eval_dataset=tokenized_swag["validation"], -... tokenizer=tokenizer, -... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer), -... ) - ->>> trainer.train() -``` - - -Para realizar el fine-tuning de un modelo en TensorFlow, primero convierte tus datasets al formato `tf.data.Dataset` con el método [`~TFPreTrainedModel.prepare_tf_dataset`]. - -```py ->>> data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer) ->>> tf_train_set = model.prepare_tf_dataset( -... tokenized_swag["train"], -... shuffle=True, -... batch_size=batch_size, -... collate_fn=data_collator, -... ) - ->>> tf_validation_set = model.prepare_tf_dataset( -... tokenized_swag["validation"], -... shuffle=False, -... batch_size=batch_size, -... collate_fn=data_collator, -... ) -``` - - - -Para familiarizarte con el fine-tuning con Keras, ¡mira el tutorial básico [aquí](training#finetune-with-keras)! 
- - - -Prepara una función de optimización, un programa para la tasa de aprendizaje y algunos hiperparámetros de entrenamiento: - -```py ->>> from transformers import create_optimizer - ->>> batch_size = 16 ->>> num_train_epochs = 2 ->>> total_train_steps = (len(tokenized_swag["train"]) // batch_size) * num_train_epochs ->>> optimizer, schedule = create_optimizer(init_lr=5e-5, num_warmup_steps=0, num_train_steps=total_train_steps) -``` - -Carga el modelo BERT con [`TFAutoModelForMultipleChoice`]: - -```py ->>> from transformers import TFAutoModelForMultipleChoice - ->>> model = TFAutoModelForMultipleChoice.from_pretrained("bert-base-uncased") -``` - -Configura el modelo para entrenarlo con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> model.compile(optimizer=optimizer) -``` - -Invoca el método [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) para realizar el fine-tuning del modelo: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2) -``` - - diff --git a/docs/source/es/tasks/question_answering.md b/docs/source/es/tasks/question_answering.md new file mode 100644 index 000000000000..2aa896142e2e --- /dev/null +++ b/docs/source/es/tasks/question_answering.md @@ -0,0 +1,275 @@ + + +# Respuesta a preguntas + + + +La respuesta a preguntas devuelve una respuesta a partir de una pregunta dada. Existen dos formas comunes de responder preguntas: + +- Extractiva: extraer la respuesta a partir del contexto dado. +- Abstractiva: generar una respuesta que responda correctamente la pregunta a partir del contexto dado. + +Esta guía te mostrará como hacer fine-tuning de [DistilBERT](https://huggingface.co/distilbert-base-uncased) en el dataset [SQuAD](https://huggingface.co/datasets/squad) para responder preguntas de forma extractiva. + + + +Revisa la [página de la tarea](https://huggingface.co/tasks/question-answering) de responder preguntas para tener más información sobre otras formas de responder preguntas y los modelos, datasets y métricas asociadas. + + + +## Carga el dataset SQuAD + +Carga el dataset SQuAD con la biblioteca 🤗 Datasets: + +```py +>>> from datasets import load_dataset + +>>> squad = load_dataset("squad") +``` + +Ahora, échale un vistazo a una muestra: + +```py +>>> squad["train"][0] +{'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']}, + 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', + 'id': '5733be284776f41900661182', + 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', + 'title': 'University_of_Notre_Dame' +} +``` + +El campo `answers` es un diccionario que contiene la posición inicial de la respuesta y el `texto` de la respuesta. 
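+
+Como referencia, en SQuAD el valor de `answer_start` es la posición (en caracteres) de la respuesta dentro de `context`, así que puedes recuperar la respuesta con un simple corte de texto. Un pequeño esbozo ilustrativo con el ejemplo anterior:
+
+```py
+>>> example = squad["train"][0]
+>>> start_char = example["answers"]["answer_start"][0]
+>>> end_char = start_char + len(example["answers"]["text"][0])
+>>> example["context"][start_char:end_char]  # debería devolver 'Saint Bernadette Soubirous'
+```
+
+La función de preprocesamiento de la siguiente sección usa exactamente esta relación para calcular las posiciones de inicio y fin a nivel de token.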
+ +## Preprocesamiento + + + +Carga el tokenizer de DistilBERT para procesar los campos `question` (pregunta) y `context` (contexto): + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") +``` + +Hay algunos pasos de preprocesamiento específicos para la tarea de respuesta a preguntas que debes tener en cuenta: + +1. Algunos ejemplos en un dataset pueden tener un contexto que supera la longitud máxima de entrada de un modelo. Trunca solamente el contexto asignándole el valor `"only_second"` al parámetro `truncation`. +2. A continuación, mapea las posiciones de inicio y fin de la respuesta al contexto original asignándole el valor `True` al parámetro `return_offsets_mapping`. +3. Una vez tengas el mapeo, puedes encontrar los tokens de inicio y fin de la respuesta. Usa el método [`sequence_ids`](https://huggingface.co/docs/tokenizers/python/latest/api/reference.html#tokenizers.Encoding.sequence_ids) +para encontrar qué parte de la lista de tokens desplazados corresponde a la pregunta y cuál corresponde al contexto. + +A continuación puedes ver como se crea una función para truncar y mapear los tokens de inicio y fin de la respuesta al `context`: + +```py +>>> def preprocess_function(examples): +... questions = [q.strip() for q in examples["question"]] +... inputs = tokenizer( +... questions, +... examples["context"], +... max_length=384, +... truncation="only_second", +... return_offsets_mapping=True, +... padding="max_length", +... ) + +... offset_mapping = inputs.pop("offset_mapping") +... answers = examples["answers"] +... start_positions = [] +... end_positions = [] + +... for i, offset in enumerate(offset_mapping): +... answer = answers[i] +... start_char = answer["answer_start"][0] +... end_char = answer["answer_start"][0] + len(answer["text"][0]) +... sequence_ids = inputs.sequence_ids(i) + +... # Encuentra el inicio y el fin del contexto +... idx = 0 +... while sequence_ids[idx] != 1: +... idx += 1 +... context_start = idx +... while sequence_ids[idx] == 1: +... idx += 1 +... context_end = idx - 1 + +... # Si la respuesta entera no está dentro del contexto, etiquétala como (0, 0) +... if offset[context_start][0] > end_char or offset[context_end][1] < start_char: +... start_positions.append(0) +... end_positions.append(0) +... else: +... # De lo contrario, esta es la posición de los tokens de inicio y fin +... idx = context_start +... while idx <= context_end and offset[idx][0] <= start_char: +... idx += 1 +... start_positions.append(idx - 1) + +... idx = context_end +... while idx >= context_start and offset[idx][1] >= end_char: +... idx -= 1 +... end_positions.append(idx + 1) + +... inputs["start_positions"] = start_positions +... inputs["end_positions"] = end_positions +... return inputs +``` + +Usa la función [`~datasets.Dataset.map`] de 🤗 Datasets para aplicarle la función de preprocesamiento al dataset entero. Puedes acelerar la función `map` haciendo `batched=True` para procesar varios elementos del dataset a la vez. +Quita las columnas que no necesites: + +```py +>>> tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names) +``` + +Usa el [`DefaultDataCollator`] para crear un lote de ejemplos. A diferencia de los otros collators de datos en 🤗 Transformers, el `DefaultDataCollator` no aplica ningún procesamiento adicional (como el rellenado). 
+ + + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator() +``` + + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator(return_tensors="tf") +``` + + + +## Entrenamiento + + + +Carga el modelo DistilBERT con [`AutoModelForQuestionAnswering`]: + +```py +>>> from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer + +>>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased") +``` + + + +Para familiarizarte con el fine-tuning con [`Trainer`], ¡mira el tutorial básico [aquí](../training#finetune-with-trainer)! + + + +En este punto, solo quedan tres pasos: + +1. Definir tus hiperparámetros de entrenamiento en [`TrainingArguments`]. +2. Pasarle los argumentos del entrenamiento al [`Trainer`] junto con el modelo, el dataset, el tokenizer y el collator de datos. +3. Invocar el método [`~Trainer.train`] para realizar el fine-tuning del modelo. + +```py +>>> training_args = TrainingArguments( +... output_dir="./results", +... evaluation_strategy="epoch", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... num_train_epochs=3, +... weight_decay=0.01, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_squad["train"], +... eval_dataset=tokenized_squad["validation"], +... tokenizer=tokenizer, +... data_collator=data_collator, +... ) + +>>> trainer.train() +``` + + +Para realizar el fine-tuning de un modelo en TensorFlow, primero convierte tus datasets al formato `tf.data.Dataset` con el método [`~TFPreTrainedModel.prepare_tf_dataset`]. + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_squad["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_validation_set = model.prepare_tf_dataset( +... tokenized_squad["validation"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + + + +Para familiarizarte con el fine-tuning con Keras, ¡mira el tutorial básico [aquí](training#finetune-with-keras)! + + + +Prepara una función de optimización, un programa para la tasa de aprendizaje y algunos hiperparámetros de entrenamiento: + +```py +>>> from transformers import create_optimizer + +>>> batch_size = 16 +>>> num_epochs = 2 +>>> total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs +>>> optimizer, schedule = create_optimizer( +... init_lr=2e-5, +... num_warmup_steps=0, +... num_train_steps=total_train_steps, +... 
) +``` + +Carga el modelo DistilBERT con [`TFAutoModelForQuestionAnswering`]: + +```py +>>> from transformers import TFAutoModelForQuestionAnswering + +>>> model = TFAutoModelForQuestionAnswering("distilbert-base-uncased") +``` + +Configura el modelo para entrenarlo con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) +``` + +Invoca el método [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) para realizar el fine-tuning del modelo: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3) +``` + + + + + +Para un ejemplo con mayor profundidad de cómo hacer fine-tuning a un modelo para responder preguntas, échale un vistazo al +[cuaderno de PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb) o al +[cuaderno de TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb) correspondiente. + + diff --git a/docs/source/es/tasks/question_answering.mdx b/docs/source/es/tasks/question_answering.mdx deleted file mode 100644 index d599fa8f1a37..000000000000 --- a/docs/source/es/tasks/question_answering.mdx +++ /dev/null @@ -1,271 +0,0 @@ - - -# Respuesta a preguntas - - - -La respuesta a preguntas devuelve una respuesta a partir de una pregunta dada. Existen dos formas comunes de responder preguntas: - -- Extractiva: extraer la respuesta a partir del contexto dado. -- Abstractiva: generar una respuesta que responda correctamente la pregunta a partir del contexto dado. - -Esta guía te mostrará como hacer fine-tuning de [DistilBERT](https://huggingface.co/distilbert-base-uncased) en el dataset [SQuAD](https://huggingface.co/datasets/squad) para responder preguntas de forma extractiva. - - - -Revisa la [página de la tarea](https://huggingface.co/tasks/question-answering) de responder preguntas para tener más información sobre otras formas de responder preguntas y los modelos, datasets y métricas asociadas. - - - -## Carga el dataset SQuAD - -Carga el dataset SQuAD con la biblioteca 🤗 Datasets: - -```py ->>> from datasets import load_dataset - ->>> squad = load_dataset("squad") -``` - -Ahora, échale un vistazo a una muestra: - -```py ->>> squad["train"][0] -{'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']}, - 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', - 'id': '5733be284776f41900661182', - 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', - 'title': 'University_of_Notre_Dame' -} -``` - -El campo `answers` es un diccionario que contiene la posición inicial de la respuesta y el `texto` de la respuesta. 
- -## Preprocesamiento - - - -Carga el tokenizer de DistilBERT para procesar los campos `question` (pregunta) y `context` (contexto): - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") -``` - -Hay algunos pasos de preprocesamiento específicos para la tarea de respuesta a preguntas que debes tener en cuenta: - -1. Algunos ejemplos en un dataset pueden tener un contexto que supera la longitud máxima de entrada de un modelo. Trunca solamente el contexto asignándole el valor `"only_second"` al parámetro `truncation`. -2. A continuación, mapea las posiciones de inicio y fin de la respuesta al contexto original asignándole el valor `True` al parámetro `return_offsets_mapping`. -3. Una vez tengas el mapeo, puedes encontrar los tokens de inicio y fin de la respuesta. Usa el método [`sequence_ids`](https://huggingface.co/docs/tokenizers/python/latest/api/reference.html#tokenizers.Encoding.sequence_ids) -para encontrar qué parte de la lista de tokens desplazados corresponde a la pregunta y cuál corresponde al contexto. - -A continuación puedes ver como se crea una función para truncar y mapear los tokens de inicio y fin de la respuesta al `context`: - -```py ->>> def preprocess_function(examples): -... questions = [q.strip() for q in examples["question"]] -... inputs = tokenizer( -... questions, -... examples["context"], -... max_length=384, -... truncation="only_second", -... return_offsets_mapping=True, -... padding="max_length", -... ) - -... offset_mapping = inputs.pop("offset_mapping") -... answers = examples["answers"] -... start_positions = [] -... end_positions = [] - -... for i, offset in enumerate(offset_mapping): -... answer = answers[i] -... start_char = answer["answer_start"][0] -... end_char = answer["answer_start"][0] + len(answer["text"][0]) -... sequence_ids = inputs.sequence_ids(i) - -... # Encuentra el inicio y el fin del contexto -... idx = 0 -... while sequence_ids[idx] != 1: -... idx += 1 -... context_start = idx -... while sequence_ids[idx] == 1: -... idx += 1 -... context_end = idx - 1 - -... # Si la respuesta entera no está dentro del contexto, etiquétala como (0, 0) -... if offset[context_start][0] > end_char or offset[context_end][1] < start_char: -... start_positions.append(0) -... end_positions.append(0) -... else: -... # De lo contrario, esta es la posición de los tokens de inicio y fin -... idx = context_start -... while idx <= context_end and offset[idx][0] <= start_char: -... idx += 1 -... start_positions.append(idx - 1) - -... idx = context_end -... while idx >= context_start and offset[idx][1] >= end_char: -... idx -= 1 -... end_positions.append(idx + 1) - -... inputs["start_positions"] = start_positions -... inputs["end_positions"] = end_positions -... return inputs -``` - -Usa la función [`~datasets.Dataset.map`] de 🤗 Datasets para aplicarle la función de preprocesamiento al dataset entero. Puedes acelerar la función `map` haciendo `batched=True` para procesar varios elementos del dataset a la vez. -Quita las columnas que no necesites: - -```py ->>> tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names) -``` - -Usa el [`DefaultDataCollator`] para crear un lote de ejemplos. A diferencia de los otros collators de datos en 🤗 Transformers, el `DefaultDataCollator` no aplica ningún procesamiento adicional (como el rellenado). 
- - - -```py ->>> from transformers import DefaultDataCollator - ->>> data_collator = DefaultDataCollator() -``` - - -```py ->>> from transformers import DefaultDataCollator - ->>> data_collator = DefaultDataCollator(return_tensors="tf") -``` - - - -## Entrenamiento - - - -Carga el modelo DistilBERT con [`AutoModelForQuestionAnswering`]: - -```py ->>> from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer - ->>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased") -``` - - - -Para familiarizarte con el fine-tuning con [`Trainer`], ¡mira el tutorial básico [aquí](../training#finetune-with-trainer)! - - - -En este punto, solo quedan tres pasos: - -1. Definir tus hiperparámetros de entrenamiento en [`TrainingArguments`]. -2. Pasarle los argumentos del entrenamiento al [`Trainer`] junto con el modelo, el dataset, el tokenizer y el collator de datos. -3. Invocar el método [`~Trainer.train`] para realizar el fine-tuning del modelo. - -```py ->>> training_args = TrainingArguments( -... output_dir="./results", -... evaluation_strategy="epoch", -... learning_rate=2e-5, -... per_device_train_batch_size=16, -... per_device_eval_batch_size=16, -... num_train_epochs=3, -... weight_decay=0.01, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=tokenized_squad["train"], -... eval_dataset=tokenized_squad["validation"], -... tokenizer=tokenizer, -... data_collator=data_collator, -... ) - ->>> trainer.train() -``` - - -Para realizar el fine-tuning de un modelo en TensorFlow, primero convierte tus datasets al formato `tf.data.Dataset` con el método [`~TFPreTrainedModel.prepare_tf_dataset`]. - -```py ->>> tf_train_set = model.prepare_tf_dataset( -... tokenized_squad["train"], -... shuffle=True, -... batch_size=16, -... collate_fn=data_collator, -... ) - ->>> tf_validation_set = model.prepare_tf_dataset( -... tokenized_squad["validation"], -... shuffle=False, -... batch_size=16, -... collate_fn=data_collator, -... ) -``` - - - -Para familiarizarte con el fine-tuning con Keras, ¡mira el tutorial básico [aquí](training#finetune-with-keras)! - - - -Prepara una función de optimización, un programa para la tasa de aprendizaje y algunos hiperparámetros de entrenamiento: - -```py ->>> from transformers import create_optimizer - ->>> batch_size = 16 ->>> num_epochs = 2 ->>> total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs ->>> optimizer, schedule = create_optimizer( -... init_lr=2e-5, -... num_warmup_steps=0, -... num_train_steps=total_train_steps, -... 
) -``` - -Carga el modelo DistilBERT con [`TFAutoModelForQuestionAnswering`]: - -```py ->>> from transformers import TFAutoModelForQuestionAnswering - ->>> model = TFAutoModelForQuestionAnswering("distilbert-base-uncased") -``` - -Configura el modelo para entrenarlo con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> import tensorflow as tf - ->>> model.compile(optimizer=optimizer) -``` - -Invoca el método [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) para realizar el fine-tuning del modelo: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3) -``` - - - - - -Para un ejemplo con mayor profundidad de cómo hacer fine-tuning a un modelo para responder preguntas, échale un vistazo al -[cuaderno de PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb) o al -[cuaderno de TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb) correspondiente. - - diff --git a/docs/source/es/tasks/summarization.md b/docs/source/es/tasks/summarization.md new file mode 100644 index 000000000000..b545e4216e5d --- /dev/null +++ b/docs/source/es/tasks/summarization.md @@ -0,0 +1,226 @@ + + +# Generación de resúmenes + + + +La generación de resúmenes (summarization, en inglés) crea una versión más corta de un documento o un artículo que resume toda su información importante. Junto con la traducción, es un ejemplo de una tarea que puede ser formulada como una tarea secuencia a secuencia. La generación de resúmenes puede ser: + +- Extractiva: Extrae la información más relevante de un documento. +- Abstractiva: Genera un texto nuevo que captura la información más importante. + +Esta guía te mostrará cómo puedes hacer fine-tuning del modelo [T5](https://huggingface.co/t5-small) sobre el subset de proyectos de ley del estado de California, dentro del dataset [BillSum](https://huggingface.co/datasets/billsum) para hacer generación de resúmenes abstractiva. + + + +Consulta la [página de la tarea](https://huggingface.co/tasks/summarization) de generación de resúmenes para obtener más información sobre sus modelos, datasets y métricas asociadas. + + + +## Carga el dataset BillSum + +Carga el dataset BillSum de la biblioteca 🤗 Datasets: + +```py +>>> from datasets import load_dataset + +>>> billsum = load_dataset("billsum", split="ca_test") +``` + +Divide el dataset en un set de train y un set de test: + +```py +>>> billsum = billsum.train_test_split(test_size=0.2) +``` + +A continuación, observa un ejemplo: + +```py +>>> billsum["train"][0] +{'summary': 'Existing law authorizes state agencies to enter into contracts for the acquisition of goods or services upon approval by the Department of General Services. Existing law sets forth various requirements and prohibitions for those contracts, including, but not limited to, a prohibition on entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between spouses and domestic partners or same-sex and different-sex couples in the provision of benefits. Existing law provides that a contract entered into in violation of those requirements and prohibitions is void and authorizes the state or any person acting on behalf of the state to bring a civil action seeking a determination that a contract is in violation and therefore void. 
Under existing law, a willful violation of those requirements and prohibitions is a misdemeanor.\nThis bill would also prohibit a state agency from entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between employees on the basis of gender identity in the provision of benefits, as specified. By expanding the scope of a crime, this bill would impose a state-mandated local program.\nThe California Constitution requires the state to reimburse local agencies and school districts for certain costs mandated by the state. Statutory provisions establish procedures for making that reimbursement.\nThis bill would provide that no reimbursement is required by this act for a specified reason.', + 'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 10295.35 is added to the Public Contract Code, to read:\n10295.35.\n(a) (1) Notwithstanding any other law, a state agency shall not enter into any contract for the acquisition of goods or services in the amount of one hundred thousand dollars ($100,000) or more with a contractor that, in the provision of benefits, discriminates between employees on the basis of an employee’s or dependent’s actual or perceived gender identity, including, but not limited to, the employee’s or dependent’s identification as transgender.\n(2) For purposes of this section, “contract” includes contracts with a cumulative amount of one hundred thousand dollars ($100,000) or more per contractor in each fiscal year.\n(3) For purposes of this section, an employee health plan is discriminatory if the plan is not consistent with Section 1365.5 of the Health and Safety Code and Section 10140 of the Insurance Code.\n(4) The requirements of this section shall apply only to those portions of a contractor’s operations that occur under any of the following conditions:\n(A) Within the state.\n(B) On real property outside the state if the property is owned by the state or if the state has a right to occupy the property, and if the contractor’s presence at that location is connected to a contract with the state.\n(C) Elsewhere in the United States where work related to a state contract is being performed.\n(b) Contractors shall treat as confidential, to the maximum extent allowed by law or by the requirement of the contractor’s insurance provider, any request by an employee or applicant for employment benefits or any documentation of eligibility for benefits submitted by an employee or applicant for employment.\n(c) After taking all reasonable measures to find a contractor that complies with this section, as determined by the state agency, the requirements of this section may be waived under any of the following circumstances:\n(1) There is only one prospective contractor willing to enter into a specific contract with the state agency.\n(2) The contract is necessary to respond to an emergency, as determined by the state agency, that endangers the public health, welfare, or safety, or the contract is necessary for the provision of essential services, and no entity that complies with the requirements of this section capable of responding to the emergency is immediately available.\n(3) The requirements of this section violate, or are inconsistent with, the terms or conditions of a grant, subvention, or agreement, if the agency has made a good faith attempt to change the terms or conditions of any grant, subvention, or agreement to authorize application of this section.\n(4) The contractor 
is providing wholesale or bulk water, power, or natural gas, the conveyance or transmission of the same, or ancillary services, as required for ensuring reliable services in accordance with good utility practice, if the purchase of the same cannot practically be accomplished through the standard competitive bidding procedures and the contractor is not providing direct retail services to end users.\n(d) (1) A contractor shall not be deemed to discriminate in the provision of benefits if the contractor, in providing the benefits, pays the actual costs incurred in obtaining the benefit.\n(2) If a contractor is unable to provide a certain benefit, despite taking reasonable measures to do so, the contractor shall not be deemed to discriminate in the provision of benefits.\n(e) (1) Every contract subject to this chapter shall contain a statement by which the contractor certifies that the contractor is in compliance with this section.\n(2) The department or other contracting agency shall enforce this section pursuant to its existing enforcement powers.\n(3) (A) If a contractor falsely certifies that it is in compliance with this section, the contract with that contractor shall be subject to Article 9 (commencing with Section 10420), unless, within a time period specified by the department or other contracting agency, the contractor provides to the department or agency proof that it has complied, or is in the process of complying, with this section.\n(B) The application of the remedies or penalties contained in Article 9 (commencing with Section 10420) to a contract subject to this chapter shall not preclude the application of any existing remedies otherwise available to the department or other contracting agency under its existing enforcement powers.\n(f) Nothing in this section is intended to regulate the contracting practices of any local jurisdiction.\n(g) This section shall be construed so as not to conflict with applicable federal laws, rules, or regulations. In the event that a court or agency of competent jurisdiction holds that federal law, rule, or regulation invalidates any clause, sentence, paragraph, or section of this code or the application thereof to any person or circumstances, it is the intent of the state that the court or agency sever that clause, sentence, paragraph, or section so that the remainder of this section shall remain in effect.\nSEC. 2.\nSection 10295.35 of the Public Contract Code shall not be construed to create any new enforcement authority or responsibility in the Department of General Services or any other contracting agency.\nSEC. 3.\nNo reimbursement is required by this act pursuant to Section 6 of Article XIII\u2009B of the California Constitution because the only costs that may be incurred by a local agency or school district will be incurred because this act creates a new crime or infraction, eliminates a crime or infraction, or changes the penalty for a crime or infraction, within the meaning of Section 17556 of the Government Code, or changes the definition of a crime within the meaning of Section 6 of Article XIII\u2009B of the California Constitution.', + 'title': 'An act to add Section 10295.35 to the Public Contract Code, relating to public contracts.'} +``` + +El campo `text` es el input y el campo `summary` es el objetivo. 
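+
+Como referencia, puede ser útil comparar la longitud del texto con la del resumen; eso ayuda a entender por qué más adelante la entrada se trunca a 1024 tokens y el resumen a 128. Un esbozo ilustrativo (las longitudes exactas dependen del ejemplo):
+
+```py
+>>> example = billsum["train"][0]
+>>> len(example["text"]), len(example["summary"])  # longitudes en caracteres; el texto es mucho más largo que el resumen
+```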
+ +## Preprocesa + +Carga el tokenizador T5 para procesar `text` y `summary`: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("t5-small") +``` + +La función de preprocesamiento necesita: + +1. Agregar un prefijo al input; una clave para que T5 sepa que se trata de una tarea de generación de resúmenes. Algunos modelos capaces de realizar múltiples tareas de NLP requieren una clave que indique la tarea específica. +2. Usar el argumento `text_target` para tokenizar etiquetas. +3. Truncar secuencias para que no sean más largas que la longitud máxima fijada por el parámetro `max_length`. + +```py +>>> prefix = "summarize: " + + +>>> def preprocess_function(examples): +... inputs = [prefix + doc for doc in examples["text"]] +... model_inputs = tokenizer(inputs, max_length=1024, truncation=True) + +... labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True) + +... model_inputs["labels"] = labels["input_ids"] +... return model_inputs +``` + +Usa la función [`~datasets.Dataset.map`] de 🤗 Datasets para aplicar la función de preprocesamiento sobre el dataset en su totalidad. Puedes acelerar la función `map` configurando el argumento `batched=True` para procesar múltiples elementos del dataset a la vez: + +```py +>>> tokenized_billsum = billsum.map(preprocess_function, batched=True) +``` + +Usa [`DataCollatorForSeq2Seq`] para crear un lote de ejemplos. Esto también *rellenará dinámicamente* tu texto y etiquetas a la dimensión del elemento más largo del lote para que tengan un largo uniforme. Si bien es posible rellenar tu texto en la función `tokenizer` mediante el argumento `padding=True`, el rellenado dinámico es más eficiente. + + + +```py +>>> from transformers import DataCollatorForSeq2Seq + +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model) +``` + + +```py +>>> from transformers import DataCollatorForSeq2Seq + +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf") +``` + + + +## Entrenamiento + + + +Carga T5 con [`AutoModelForSeq2SeqLM`]: + +```py +>>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer + +>>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-small") +``` + + + +Para familiarizarte con el proceso para realizar fine-tuning sobre un modelo con [`Trainer`], ¡mira el tutorial básico [aquí](../training#finetune-with-trainer)! + + + +En este punto, solo faltan tres pasos: + +1. Definir tus hiperparámetros de entrenamiento en [`Seq2SeqTrainingArguments`]. +2. Pasarle los argumentos de entrenamiento a [`Seq2SeqTrainer`] junto con el modelo, dataset y data collator. +3. Llamar [`~Trainer.train`] para realizar el fine-tuning sobre tu modelo. + +```py +>>> training_args = Seq2SeqTrainingArguments( +... output_dir="./results", +... evaluation_strategy="epoch", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... weight_decay=0.01, +... save_total_limit=3, +... num_train_epochs=1, +... fp16=True, +... ) + +>>> trainer = Seq2SeqTrainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_billsum["train"], +... eval_dataset=tokenized_billsum["test"], +... tokenizer=tokenizer, +... data_collator=data_collator, +... ) + +>>> trainer.train() +``` + + +Para hacer fine-tuning de un modelo en TensorFlow, comienza por convertir tus datasets al formato `tf.data.Dataset` con [`~datasets.Dataset.to_tf_dataset`]. 
Especifica los inputs y etiquetas en `columns`, el tamaño de lote, el data collator, y si es necesario mezclar el dataset: + +```py +>>> tf_train_set = tokenized_billsum["train"].to_tf_dataset( +... columns=["attention_mask", "input_ids", "labels"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_test_set = tokenized_billsum["test"].to_tf_dataset( +... columns=["attention_mask", "input_ids", "labels"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + + + +Para familiarizarte con el fine-tuning con Keras, ¡mira el tutorial básico [aquí](training#finetune-with-keras)! + + + +Crea la función optimizadora, establece la tasa de aprendizaje y algunos hiperparámetros de entrenamiento: + +```py +>>> from transformers import create_optimizer, AdamWeightDecay + +>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) +``` + +Carga T5 con [`TFAutoModelForSeq2SeqLM`]: + +```py +>>> from transformers import TFAutoModelForSeq2SeqLM + +>>> model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small") +``` + +Configura el modelo para entrenamiento con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): + +```py +>>> model.compile(optimizer=optimizer) +``` + +Llama a [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) para realizar el fine-tuning del modelo: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3) +``` + + + + + +Para un ejemplo con mayor profundidad de cómo hacer fine-tuning a un modelo para generación de resúmenes, revisa la +[notebook en PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb) +o a la [notebook en TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb). + + diff --git a/docs/source/es/tasks/summarization.mdx b/docs/source/es/tasks/summarization.mdx deleted file mode 100644 index c09c4b0b833a..000000000000 --- a/docs/source/es/tasks/summarization.mdx +++ /dev/null @@ -1,222 +0,0 @@ - - -# Generación de resúmenes - - - -La generación de resúmenes (summarization, en inglés) crea una versión más corta de un documento o un artículo que resume toda su información importante. Junto con la traducción, es un ejemplo de una tarea que puede ser formulada como una tarea secuencia a secuencia. La generación de resúmenes puede ser: - -- Extractiva: Extrae la información más relevante de un documento. -- Abstractiva: Genera un texto nuevo que captura la información más importante. - -Esta guía te mostrará cómo puedes hacer fine-tuning del modelo [T5](https://huggingface.co/t5-small) sobre el subset de proyectos de ley del estado de California, dentro del dataset [BillSum](https://huggingface.co/datasets/billsum) para hacer generación de resúmenes abstractiva. - - - -Consulta la [página de la tarea](https://huggingface.co/tasks/summarization) de generación de resúmenes para obtener más información sobre sus modelos, datasets y métricas asociadas. 
- - - -## Carga el dataset BillSum - -Carga el dataset BillSum de la biblioteca 🤗 Datasets: - -```py ->>> from datasets import load_dataset - ->>> billsum = load_dataset("billsum", split="ca_test") -``` - -Divide el dataset en un set de train y un set de test: - -```py ->>> billsum = billsum.train_test_split(test_size=0.2) -``` - -A continuación, observa un ejemplo: - -```py ->>> billsum["train"][0] -{'summary': 'Existing law authorizes state agencies to enter into contracts for the acquisition of goods or services upon approval by the Department of General Services. Existing law sets forth various requirements and prohibitions for those contracts, including, but not limited to, a prohibition on entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between spouses and domestic partners or same-sex and different-sex couples in the provision of benefits. Existing law provides that a contract entered into in violation of those requirements and prohibitions is void and authorizes the state or any person acting on behalf of the state to bring a civil action seeking a determination that a contract is in violation and therefore void. Under existing law, a willful violation of those requirements and prohibitions is a misdemeanor.\nThis bill would also prohibit a state agency from entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between employees on the basis of gender identity in the provision of benefits, as specified. By expanding the scope of a crime, this bill would impose a state-mandated local program.\nThe California Constitution requires the state to reimburse local agencies and school districts for certain costs mandated by the state. 
Statutory provisions establish procedures for making that reimbursement.\nThis bill would provide that no reimbursement is required by this act for a specified reason.', - 'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 10295.35 is added to the Public Contract Code, to read:\n10295.35.\n(a) (1) Notwithstanding any other law, a state agency shall not enter into any contract for the acquisition of goods or services in the amount of one hundred thousand dollars ($100,000) or more with a contractor that, in the provision of benefits, discriminates between employees on the basis of an employee’s or dependent’s actual or perceived gender identity, including, but not limited to, the employee’s or dependent’s identification as transgender.\n(2) For purposes of this section, “contract” includes contracts with a cumulative amount of one hundred thousand dollars ($100,000) or more per contractor in each fiscal year.\n(3) For purposes of this section, an employee health plan is discriminatory if the plan is not consistent with Section 1365.5 of the Health and Safety Code and Section 10140 of the Insurance Code.\n(4) The requirements of this section shall apply only to those portions of a contractor’s operations that occur under any of the following conditions:\n(A) Within the state.\n(B) On real property outside the state if the property is owned by the state or if the state has a right to occupy the property, and if the contractor’s presence at that location is connected to a contract with the state.\n(C) Elsewhere in the United States where work related to a state contract is being performed.\n(b) Contractors shall treat as confidential, to the maximum extent allowed by law or by the requirement of the contractor’s insurance provider, any request by an employee or applicant for employment benefits or any documentation of eligibility for benefits submitted by an employee or applicant for employment.\n(c) After taking all reasonable measures to find a contractor that complies with this section, as determined by the state agency, the requirements of this section may be waived under any of the following circumstances:\n(1) There is only one prospective contractor willing to enter into a specific contract with the state agency.\n(2) The contract is necessary to respond to an emergency, as determined by the state agency, that endangers the public health, welfare, or safety, or the contract is necessary for the provision of essential services, and no entity that complies with the requirements of this section capable of responding to the emergency is immediately available.\n(3) The requirements of this section violate, or are inconsistent with, the terms or conditions of a grant, subvention, or agreement, if the agency has made a good faith attempt to change the terms or conditions of any grant, subvention, or agreement to authorize application of this section.\n(4) The contractor is providing wholesale or bulk water, power, or natural gas, the conveyance or transmission of the same, or ancillary services, as required for ensuring reliable services in accordance with good utility practice, if the purchase of the same cannot practically be accomplished through the standard competitive bidding procedures and the contractor is not providing direct retail services to end users.\n(d) (1) A contractor shall not be deemed to discriminate in the provision of benefits if the contractor, in providing the benefits, pays the actual costs incurred in obtaining the 
benefit.\n(2) If a contractor is unable to provide a certain benefit, despite taking reasonable measures to do so, the contractor shall not be deemed to discriminate in the provision of benefits.\n(e) (1) Every contract subject to this chapter shall contain a statement by which the contractor certifies that the contractor is in compliance with this section.\n(2) The department or other contracting agency shall enforce this section pursuant to its existing enforcement powers.\n(3) (A) If a contractor falsely certifies that it is in compliance with this section, the contract with that contractor shall be subject to Article 9 (commencing with Section 10420), unless, within a time period specified by the department or other contracting agency, the contractor provides to the department or agency proof that it has complied, or is in the process of complying, with this section.\n(B) The application of the remedies or penalties contained in Article 9 (commencing with Section 10420) to a contract subject to this chapter shall not preclude the application of any existing remedies otherwise available to the department or other contracting agency under its existing enforcement powers.\n(f) Nothing in this section is intended to regulate the contracting practices of any local jurisdiction.\n(g) This section shall be construed so as not to conflict with applicable federal laws, rules, or regulations. In the event that a court or agency of competent jurisdiction holds that federal law, rule, or regulation invalidates any clause, sentence, paragraph, or section of this code or the application thereof to any person or circumstances, it is the intent of the state that the court or agency sever that clause, sentence, paragraph, or section so that the remainder of this section shall remain in effect.\nSEC. 2.\nSection 10295.35 of the Public Contract Code shall not be construed to create any new enforcement authority or responsibility in the Department of General Services or any other contracting agency.\nSEC. 3.\nNo reimbursement is required by this act pursuant to Section 6 of Article XIII\u2009B of the California Constitution because the only costs that may be incurred by a local agency or school district will be incurred because this act creates a new crime or infraction, eliminates a crime or infraction, or changes the penalty for a crime or infraction, within the meaning of Section 17556 of the Government Code, or changes the definition of a crime within the meaning of Section 6 of Article XIII\u2009B of the California Constitution.', - 'title': 'An act to add Section 10295.35 to the Public Contract Code, relating to public contracts.'} -``` - -El campo `text` es el input y el campo `summary` es el objetivo. - -## Preprocesa - -Carga el tokenizador T5 para procesar `text` y `summary`: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("t5-small") -``` - -La función de preprocesamiento necesita: - -1. Agregar un prefijo al input; una clave para que T5 sepa que se trata de una tarea de generación de resúmenes. Algunos modelos capaces de realizar múltiples tareas de NLP requieren una clave que indique la tarea específica. -2. Usar el argumento `text_target` para tokenizar etiquetas. -3. Truncar secuencias para que no sean más largas que la longitud máxima fijada por el parámetro `max_length`. - -```py ->>> prefix = "summarize: " - - ->>> def preprocess_function(examples): -... inputs = [prefix + doc for doc in examples["text"]] -... 
model_inputs = tokenizer(inputs, max_length=1024, truncation=True) - -... labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True) - -... model_inputs["labels"] = labels["input_ids"] -... return model_inputs -``` - -Usa la función [`~datasets.Dataset.map`] de 🤗 Datasets para aplicar la función de preprocesamiento sobre el dataset en su totalidad. Puedes acelerar la función `map` configurando el argumento `batched=True` para procesar múltiples elementos del dataset a la vez: - -```py ->>> tokenized_billsum = billsum.map(preprocess_function, batched=True) -``` - -Usa [`DataCollatorForSeq2Seq`] para crear un lote de ejemplos. Esto también *rellenará dinámicamente* tu texto y etiquetas a la dimensión del elemento más largo del lote para que tengan un largo uniforme. Si bien es posible rellenar tu texto en la función `tokenizer` mediante el argumento `padding=True`, el rellenado dinámico es más eficiente. - - - -```py ->>> from transformers import DataCollatorForSeq2Seq - ->>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model) -``` - - -```py ->>> from transformers import DataCollatorForSeq2Seq - ->>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf") -``` - - - -## Entrenamiento - - - -Carga T5 con [`AutoModelForSeq2SeqLM`]: - -```py ->>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer - ->>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-small") -``` - - - -Para familiarizarte con el proceso para realizar fine-tuning sobre un modelo con [`Trainer`], ¡mira el tutorial básico [aquí](../training#finetune-with-trainer)! - - - -En este punto, solo faltan tres pasos: - -1. Definir tus hiperparámetros de entrenamiento en [`Seq2SeqTrainingArguments`]. -2. Pasarle los argumentos de entrenamiento a [`Seq2SeqTrainer`] junto con el modelo, dataset y data collator. -3. Llamar [`~Trainer.train`] para realizar el fine-tuning sobre tu modelo. - -```py ->>> training_args = Seq2SeqTrainingArguments( -... output_dir="./results", -... evaluation_strategy="epoch", -... learning_rate=2e-5, -... per_device_train_batch_size=16, -... per_device_eval_batch_size=16, -... weight_decay=0.01, -... save_total_limit=3, -... num_train_epochs=1, -... fp16=True, -... ) - ->>> trainer = Seq2SeqTrainer( -... model=model, -... args=training_args, -... train_dataset=tokenized_billsum["train"], -... eval_dataset=tokenized_billsum["test"], -... tokenizer=tokenizer, -... data_collator=data_collator, -... ) - ->>> trainer.train() -``` - - -Para hacer fine-tuning de un modelo en TensorFlow, comienza por convertir tus datasets al formato `tf.data.Dataset` con [`~datasets.Dataset.to_tf_dataset`]. Especifica los inputs y etiquetas en `columns`, el tamaño de lote, el data collator, y si es necesario mezclar el dataset: - -```py ->>> tf_train_set = tokenized_billsum["train"].to_tf_dataset( -... columns=["attention_mask", "input_ids", "labels"], -... shuffle=True, -... batch_size=16, -... collate_fn=data_collator, -... ) - ->>> tf_test_set = tokenized_billsum["test"].to_tf_dataset( -... columns=["attention_mask", "input_ids", "labels"], -... shuffle=False, -... batch_size=16, -... collate_fn=data_collator, -... ) -``` - - - -Para familiarizarte con el fine-tuning con Keras, ¡mira el tutorial básico [aquí](training#finetune-with-keras)! 
- - - -Crea la función optimizadora, establece la tasa de aprendizaje y algunos hiperparámetros de entrenamiento: - -```py ->>> from transformers import create_optimizer, AdamWeightDecay - ->>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) -``` - -Carga T5 con [`TFAutoModelForSeq2SeqLM`]: - -```py ->>> from transformers import TFAutoModelForSeq2SeqLM - ->>> model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small") -``` - -Configura el modelo para entrenamiento con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> model.compile(optimizer=optimizer) -``` - -Llama a [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) para realizar el fine-tuning del modelo: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3) -``` - - - - - -Para un ejemplo con mayor profundidad de cómo hacer fine-tuning a un modelo para generación de resúmenes, revisa la -[notebook en PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb) -o a la [notebook en TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb). - - \ No newline at end of file diff --git a/docs/source/es/training.md b/docs/source/es/training.md new file mode 100644 index 000000000000..7b7b0657bd8f --- /dev/null +++ b/docs/source/es/training.md @@ -0,0 +1,371 @@ + + +# Fine-tuning a un modelo pre-entrenado + +[[open-in-colab]] + +El uso de un modelo pre-entrenado tiene importantes ventajas. Reduce los costos de computación, la huella de carbono y te permite utilizar modelos de última generación sin tener que entrenar uno desde cero. + +* Fine-tuning a un modelo pre-entrenado con 🤗 Transformers [`Trainer`]. +* Fine-tuning a un modelo pre-entrenado en TensorFlow con Keras. +* Fine-tuning a un modelo pre-entrenado en PyTorch nativo. + + + +## Prepara un dataset + + + +Antes de aplicar fine-tuning a un modelo pre-entrenado, descarga un dataset y prepáralo para el entrenamiento. El tutorial anterior nos enseñó cómo procesar los datos para el entrenamiento, y ahora es la oportunidad de poner a prueba estas habilidades. + +Comienza cargando el dataset de [Yelp Reviews](https://huggingface.co/datasets/yelp_review_full): + +```py +>>> from datasets import load_dataset + +>>> dataset = load_dataset("yelp_review_full") +>>> dataset[100] +{'label': 0, + 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. 
But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'} +``` + +Como ya sabes, necesitas un tokenizador para procesar el texto e incluir una estrategia para el padding y el truncamiento para manejar cualquier longitud de secuencia variable. Para procesar tu dataset en un solo paso, utiliza el método de 🤗 Datasets map para aplicar una función de preprocesamiento sobre todo el dataset: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + + +>>> def tokenize_function(examples): +... return tokenizer(examples["text"], padding="max_length", truncation=True) + + +>>> tokenized_datasets = dataset.map(tokenize_function, batched=True) +``` + +Si lo deseas, puedes crear un subconjunto más pequeño del dataset completo para aplicarle fine-tuning y así reducir el tiempo. + +```py +>>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) +>>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) +``` + + + +## Fine-tuning con `Trainer` + + + +🤗 Transformers proporciona una clase [`Trainer`] optimizada para el entrenamiento de modelos de 🤗 Transformers, haciendo más fácil el inicio del entrenamiento sin necesidad de escribir manualmente tu propio ciclo. La API del [`Trainer`] soporta una amplia gama de opciones de entrenamiento y características como el logging, el gradient accumulation y el mixed precision. + +Comienza cargando tu modelo y especifica el número de labels previstas. A partir del [Card Dataset](https://huggingface.co/datasets/yelp_review_full#data-fields) de Yelp Review, que como ya sabemos tiene 5 labels: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) +``` + + + +Verás una advertencia acerca de que algunos de los pesos pre-entrenados no están siendo utilizados y que algunos pesos están siendo inicializados al azar. No te preocupes, esto es completamente normal. +El head/cabezal pre-entrenado del modelo BERT se descarta y se sustituye por un head de clasificación inicializado aleatoriamente. Puedes aplicar fine-tuning a este nuevo head del modelo en tu tarea de clasificación de secuencias haciendo transfer learning del modelo pre-entrenado. + + + +### Hiperparámetros de entrenamiento + +A continuación, crea una clase [`TrainingArguments`] que contenga todos los hiperparámetros que puedes ajustar así como los indicadores para activar las diferentes opciones de entrenamiento. Para este tutorial puedes empezar con los [hiperparámetros](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments) de entrenamiento por defecto, pero siéntete libre de experimentar con ellos para encontrar tu configuración óptima. + +Especifica dónde vas a guardar los checkpoints de tu entrenamiento: + +```py +>>> from transformers import TrainingArguments + +>>> training_args = TrainingArguments(output_dir="test_trainer") +``` + +### Métricas + +El [`Trainer`] no evalúa automáticamente el rendimiento del modelo durante el entrenamiento. Tendrás que pasarle a [`Trainer`] una función para calcular y hacer un reporte de las métricas. 
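+A modo de referencia, un esquema mínimo e ilustrativo (no es necesario para seguir el tutorial) del contrato de esa función, basado en la clase `EvalPrediction` de 🤗 Transformers, que es el objeto que el [`Trainer`] construye internamente: la función recibe las predicciones del modelo (logits) junto con las etiquetas y debe devolver un diccionario con el nombre y el valor de cada métrica.
+
+```py
+>>> import numpy as np
+>>> from transformers import EvalPrediction
+
+>>> # El Trainer le pasa a tu función un objeto equivalente a este
+>>> eval_pred = EvalPrediction(predictions=np.array([[0.1, 2.3], [1.5, 0.2]]), label_ids=np.array([1, 0]))
+>>> logits, labels = eval_pred  # se puede desempaquetar como una tupla
+>>> # El valor devuelto debe ser un diccionario {nombre_de_métrica: valor}
+>>> {"accuracy": float((np.argmax(logits, axis=-1) == labels).mean())}
+{'accuracy': 1.0}
+```
+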
En la práctica, la biblioteca de 🤗 Datasets proporciona una función de [`accuracy`](https://huggingface.co/metrics/accuracy) simple que puedes cargar con la función `load_metric` (ver este [tutorial](https://huggingface.co/docs/datasets/metrics.html) para más información): + +```py +>>> import numpy as np +>>> from datasets import load_metric + +>>> metric = load_metric("accuracy") +``` + +Llama al método `compute` de `metric` para calcular el accuracy de tus predicciones. Antes de pasar tus predicciones a `compute`, necesitas convertir los logits en predicciones (recuerda que todos los modelos de 🤗 Transformers devuelven logits). + +```py +>>> def compute_metrics(eval_pred): +... logits, labels = eval_pred +... predictions = np.argmax(logits, axis=-1) +... return metric.compute(predictions=predictions, references=labels) +``` + +Si quieres controlar tus métricas de evaluación durante el fine-tuning, especifica el parámetro `evaluation_strategy` en tus argumentos de entrenamiento para que se reporte la métrica de evaluación al final de cada época: + +```py +>>> from transformers import TrainingArguments + +>>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") +``` + +### Trainer + +Crea un objeto [`Trainer`] con tu modelo, argumentos de entrenamiento, datasets de entrenamiento y de prueba, y tu función de evaluación: + +```py +>>> from transformers import Trainer + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=small_train_dataset, +... eval_dataset=small_eval_dataset, +... compute_metrics=compute_metrics, +... ) +``` + +A continuación, aplica fine-tuning a tu modelo llamando [`~transformers.Trainer.train`]: + +```py +>>> trainer.train() +``` + + + +## Fine-tuning con Keras + + + +Los modelos de 🤗 Transformers también permiten realizar el entrenamiento en TensorFlow con la API de Keras. Solo es necesario hacer algunos cambios antes de hacer fine-tuning. + +### Convierte el dataset al formato de TensorFlow + +El [`DefaultDataCollator`] junta los tensores en un batch para que el modelo se entrene en él. Asegúrate de especificar `return_tensors` para devolver los tensores de TensorFlow: + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator(return_tensors="tf") +``` + + + +[`Trainer`] utiliza [`DataCollatorWithPadding`] por defecto por lo que no es necesario especificar explícitamente un intercalador de datos (data collator, en inglés). + + + +A continuación, convierte los datasets tokenizados en datasets de TensorFlow con el método [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Especifica tus entradas en `columns` y tu etiqueta en `label_cols`: + +```py +>>> tf_train_dataset = small_train_dataset.to_tf_dataset( +... columns=["attention_mask", "input_ids", "token_type_ids"], +... label_cols="labels", +... shuffle=True, +... collate_fn=data_collator, +... batch_size=8, +... ) + +>>> tf_validation_dataset = small_eval_dataset.to_tf_dataset( +... columns=["attention_mask", "input_ids", "token_type_ids"], +... label_cols="labels", +... shuffle=False, +... collate_fn=data_collator, +... batch_size=8, +...
) +``` + +### Compila y ajusta + +Carguemos un modelo TensorFlow con el número esperado de labels: + +```py +>>> import tensorflow as tf +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) +``` + +A continuación, compila y aplica fine-tuning a tu modelo con [`fit`](https://keras.io/api/models/model_training_apis/) como lo harías con cualquier otro modelo de Keras: + +```py +>>> model.compile( +... optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5), +... loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), +... metrics=tf.metrics.SparseCategoricalAccuracy(), +... ) + +>>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3) +``` + + + +## Fine-tune en PyTorch nativo + + + +El [`Trainer`] se encarga del ciclo de entrenamiento y permite aplicar fine-tuning a un modelo en una sola línea de código. Para los que prefieran escribir su propio ciclo de entrenamiento, también pueden aplicar fine-tuning a un modelo de 🤗 Transformers en PyTorch nativo. + +En este punto, es posible que necesites reiniciar tu notebook o ejecutar el siguiente código para liberar algo de memoria: + +```py +import torch + +del model +del trainer +torch.cuda.empty_cache() +``` + +A continuación, haremos un post-procesamiento manual a `tokenized_datasets` y así prepararlo para el entrenamiento. + +1. Elimina la columna de `text` porque el modelo no acepta texto en crudo como entrada: + + ```py + >>> tokenized_datasets = tokenized_datasets.remove_columns(["text"]) + ``` + +2. Cambia el nombre de la columna de `label` a `labels` porque el modelo espera que el argumento se llame `labels`: + + ```py + >>> tokenized_datasets = tokenized_datasets.rename_column("label", "labels") + ``` + +3. Establece el formato del dataset para devolver tensores PyTorch en lugar de listas: + + ```py + >>> tokenized_datasets.set_format("torch") + ``` + +A continuación, crea un subconjunto más pequeño del dataset como se ha mostrado anteriormente para acelerar el fine-tuning: + +```py +>>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) +>>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) +``` + +### DataLoader + +Crea un `DataLoader` para tus datasets de entrenamiento y de prueba para poder iterar sobre batches de datos: + +```py +>>> from torch.utils.data import DataLoader + +>>> train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8) +>>> eval_dataloader = DataLoader(small_eval_dataset, batch_size=8) +``` + +Carga tu modelo con el número de labels previstas: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) +``` + +### Optimiza y programa el learning rate + +Crea un optimizador y un scheduler de learning rate para aplicar fine-tuning al modelo. Vamos a utilizar el optimizador [`AdamW`](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) de PyTorch: + +```py +>>> from torch.optim import AdamW + +>>> optimizer = AdamW(model.parameters(), lr=5e-5) +``` + +Crea el scheduler de learning rate por defecto que utiliza el [`Trainer`]: + +```py +>>> from transformers import get_scheduler + +>>> num_epochs = 3 +>>> num_training_steps = num_epochs * len(train_dataloader) +>>> lr_scheduler = get_scheduler( +... name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps +...
) +``` + +Por último, especifica el `device` o entorno de ejecución para utilizar una GPU si tienes acceso a una. De lo contrario, el entrenamiento en una CPU puede llevarte varias horas en lugar de un par de minutos. + +```py +>>> import torch + +>>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +>>> model.to(device) +``` + + + +Consigue acceso gratuito a una GPU en la nube si es que no tienes este recurso de forma local con un notebook alojado en [Colaboratory](https://colab.research.google.com/) o [SageMaker StudioLab](https://studiolab.sagemaker.aws/). + + + +Genial, ¡ahora podemos entrenar! 🥳 + +### Ciclo de entrenamiento + +Para hacer un seguimiento al progreso del entrenamiento, utiliza la biblioteca [tqdm](https://tqdm.github.io/) para añadir una barra de progreso sobre el número de pasos de entrenamiento: + +```py +>>> from tqdm.auto import tqdm + +>>> progress_bar = tqdm(range(num_training_steps)) + +>>> model.train() +>>> for epoch in range(num_epochs): +... for batch in train_dataloader: +... batch = {k: v.to(device) for k, v in batch.items()} +... outputs = model(**batch) +... loss = outputs.loss +... loss.backward() + +... optimizer.step() +... lr_scheduler.step() +... optimizer.zero_grad() +... progress_bar.update(1) +``` + +### Métricas + +De la misma manera que necesitas añadir una función de evaluación al [`Trainer`], necesitas hacer lo mismo cuando escribas tu propio ciclo de entrenamiento. Pero en lugar de calcular y reportar la métrica al final de cada época, esta vez acumularás todos los batches con [`add_batch`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=add_batch#datasets.Metric.add_batch) y calcularás la métrica al final. + +```py +>>> metric = load_metric("accuracy") +>>> model.eval() +>>> for batch in eval_dataloader: +... batch = {k: v.to(device) for k, v in batch.items()} +... with torch.no_grad(): +... outputs = model(**batch) + +... logits = outputs.logits +... predictions = torch.argmax(logits, dim=-1) +... metric.add_batch(predictions=predictions, references=batch["labels"]) + +>>> metric.compute() +``` + + + +## Recursos adicionales + +Para más ejemplos de fine-tuning consulta: + +- [🤗 Transformers Examples](https://github.com/huggingface/transformers/tree/main/examples) incluye scripts + para entrenar tareas comunes de NLP en PyTorch y TensorFlow. + +- [🤗 Transformers Notebooks](notebooks) contiene varios notebooks sobre cómo aplicar fine-tuning a un modelo para tareas específicas en PyTorch y TensorFlow. diff --git a/docs/source/es/training.mdx b/docs/source/es/training.mdx deleted file mode 100644 index 467df17d1380..000000000000 --- a/docs/source/es/training.mdx +++ /dev/null @@ -1,367 +0,0 @@ - - -# Fine-tuning a un modelo pre-entrenado - -[[open-in-colab]] - -El uso de un modelo pre-entrenado tiene importantes ventajas. Reduce los costos de computación, la huella de carbono y te permite utilizar modelos de última generación sin tener que entrenar uno desde cero. - -* Fine-tuning a un modelo pre-entrenado con 🤗 Transformers [`Trainer`]. -* Fine-tuning a un modelo pre-entrenado en TensorFlow con Keras. -* Fine-tuning a un modelo pre-entrenado en PyTorch nativo. - - - -## Prepara un dataset - - - -Antes de aplicar fine-tuning a un modelo pre-entrenado, descarga un dataset y prepáralo para el entrenamiento. El tutorial anterior nos enseñó cómo procesar los datos para el entrenamiento, y ahora es la oportunidad de poner a prueba estas habilidades. 
- -Comienza cargando el dataset de [Yelp Reviews](https://huggingface.co/datasets/yelp_review_full): - -```py ->>> from datasets import load_dataset - ->>> dataset = load_dataset("yelp_review_full") ->>> dataset[100] -{'label': 0, - 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'} -``` - -Como ya sabes, necesitas un tokenizador para procesar el texto e incluir una estrategia para el padding y el truncamiento para manejar cualquier longitud de secuencia variable. Para procesar tu dataset en un solo paso, utiliza el método de 🤗 Datasets map para aplicar una función de preprocesamiento sobre todo el dataset: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") - - ->>> def tokenize_function(examples): -... return tokenizer(examples["text"], padding="max_length", truncation=True) - - ->>> tokenized_datasets = dataset.map(tokenize_function, batched=True) -``` - -Si lo deseas, puedes crear un subconjunto más pequeño del dataset completo para aplicarle fine-tuning y así reducir el tiempo. - -```py ->>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) ->>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) -``` - - - -## Fine-tuning con `Trainer` - - - -🤗 Transformers proporciona una clase [`Trainer`] optimizada para el entrenamiento de modelos de 🤗 Transformers, haciendo más fácil el inicio del entrenamiento sin necesidad de escribir manualmente tu propio ciclo. La API del [`Trainer`] soporta una amplia gama de opciones de entrenamiento y características como el logging, el gradient accumulation y el mixed precision. - -Comienza cargando tu modelo y especifica el número de labels previstas. A partir del [Card Dataset](https://huggingface.co/datasets/yelp_review_full#data-fields) de Yelp Review, que como ya sabemos tiene 5 labels: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) -``` - - - -Verás una advertencia acerca de que algunos de los pesos pre-entrenados no están siendo utilizados y que algunos pesos están siendo inicializados al azar. 
No te preocupes, esto es completamente normal. -El head/cabezal pre-entrenado del modelo BERT se descarta y se sustituye por un head de clasificación inicializado aleatoriamente. Puedes aplicar fine-tuning a este nuevo head del modelo en tu tarea de clasificación de secuencias haciendo transfer learning del modelo pre-entrenado. - - - -### Hiperparámetros de entrenamiento - -A continuación, crea una clase [`TrainingArguments`] que contenga todos los hiperparámetros que puedes ajustar así como los indicadores para activar las diferentes opciones de entrenamiento. Para este tutorial puedes empezar con los [hiperparámetros](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments) de entrenamiento por defecto, pero siéntete libre de experimentar con ellos para encontrar tu configuración óptima. - -Especifica dónde vas a guardar los checkpoints de tu entrenamiento: - -```py ->>> from transformers import TrainingArguments - ->>> training_args = TrainingArguments(output_dir="test_trainer") -``` - -### Métricas - -El [`Trainer`] no evalúa automáticamente el rendimiento del modelo durante el entrenamiento. Tendrás que pasarle a [`Trainer`] una función para calcular y hacer un reporte de las métricas. La biblioteca de 🤗 Datasets proporciona una función de [`accuracy`](https://huggingface.co/metrics/accuracy) simple que puedes cargar con la función `load_metric` (ver este [tutorial](https://huggingface.co/docs/datasets/metrics.html) para más información): - -```py ->>> import numpy as np ->>> from datasets import load_metric - ->>> metric = load_metric("accuracy") -``` - -Define la función `compute` en `metric` para calcular el accuracy de tus predicciones. Antes de pasar tus predicciones a `compute`, necesitas convertir las predicciones a logits (recuerda que todos los modelos de 🤗 Transformers devuelven logits). - -```py ->>> def compute_metrics(eval_pred): -... logits, labels = eval_pred -... predictions = np.argmax(logits, axis=-1) -... return metric.compute(predictions=predictions, references=labels) -``` - -Si quieres controlar tus métricas de evaluación durante el fine-tuning, especifica el parámetro `evaluation_strategy` en tus argumentos de entrenamiento para que el modelo tenga en cuenta la métrica de evaluación al final de cada época: - -```py ->>> from transformers import TrainingArguments - ->>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") -``` - -### Trainer - -Crea un objeto [`Trainer`] con tu modelo, argumentos de entrenamiento, datasets de entrenamiento y de prueba, y tu función de evaluación: - -```py ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=small_train_dataset, -... eval_dataset=small_eval_dataset, -... compute_metrics=compute_metrics, -... ) -``` - -A continuación, aplica fine-tuning a tu modelo llamando [`~transformers.Trainer.train`]: - -```py ->>> trainer.train() -``` - - - -## Fine-tuning con Keras - - - -Los modelos de 🤗 Transformers también permiten realizar el entrenamiento en TensorFlow con la API de Keras. Solo es necesario hacer algunos cambios antes de hacer fine-tuning. - -### Convierte el dataset al formato de TensorFlow - -El [`DefaultDataCollator`] junta los tensores en un batch para que el modelo se entrene en él. 
Asegúrate de especificar `return_tensors` para devolver los tensores de TensorFlow: - -```py ->>> from transformers import DefaultDataCollator - ->>> data_collator = DefaultDataCollator(return_tensors="tf") -``` - - - -[`Trainer`] utiliza [`DataCollatorWithPadding`] por defecto por lo que no es necesario especificar explícitamente un intercalador de datos (data collator, en inglés). - - - -A continuación, convierte los datasets tokenizados en datasets de TensorFlow con el método [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Especifica tus entradas en `columns` y tu etiqueta en `label_cols`: - -```py ->>> tf_train_dataset = small_train_dataset.to_tf_dataset( -... columns=["attention_mask", "input_ids", "token_type_ids"], -... label_cols=["labels"], -... shuffle=True, -... collate_fn=data_collator, -... batch_size=8, -... ) - ->>> tf_validation_dataset = small_eval_dataset.to_tf_dataset( -... columns=["attention_mask", "input_ids", "token_type_ids"], -... label_cols=["labels"], -... shuffle=False, -... collate_fn=data_collator, -... batch_size=8, -... ) -``` - -### Compila y ajusta - -Carguemos un modelo TensorFlow con el número esperado de labels: - -```py ->>> import tensorflow as tf ->>> from transformers import TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) -``` - -A continuación, compila y aplica fine-tuning a tu modelo con [`fit`](https://keras.io/api/models/model_training_apis/) como lo harías con cualquier otro modelo de Keras: - -```py ->>> model.compile( -... optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5), -... loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), -... metrics=tf.metrics.SparseCategoricalAccuracy(), -... ) - ->>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3) -``` - - - -## Fine-tune en PyTorch nativo - - - -El [`Trainer`] se encarga del ciclo de entrenamiento y permite aplicar fine-tuning a un modelo en una sola línea de código. Para los que prefieran escribir su propio ciclo de entrenamiento, también pueden aplicar fine-tuning a un modelo de 🤗 Transformers en PyTorch nativo. - -En este punto, es posible que necesites reiniciar tu notebook o ejecutar el siguiente código para liberar algo de memoria: - -```py -del model -del pytorch_model -del trainer -torch.cuda.empty_cache() -``` - -A continuación, haremos un post-procesamiento manual al `tokenized_dataset` y así prepararlo para el entrenamiento. - -1. Elimina la columna de `text` porque el modelo no acepta texto en crudo como entrada: - - ```py - >>> tokenized_datasets = tokenized_datasets.remove_columns(["text"]) - ``` - -2. Cambia el nombre de la columna de `label` a `labels` porque el modelo espera que el argumento se llame `labels`: - - ```py - >>> tokenized_datasets = tokenized_datasets.rename_column("label", "labels") - ``` - -3. 
Establece el formato del dataset para devolver tensores PyTorch en lugar de listas: - - ```py - >>> tokenized_datasets.set_format("torch") - ``` - -A continuación, crea un subconjunto más pequeño del dataset como se ha mostrado anteriormente para acelerar el fine-tuning: - -```py ->>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) ->>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) -``` - -### DataLoader - -Crea un `DataLoader` para tus datasets de entrenamiento y de prueba para poder iterar sobre batches de datos: - -```py ->>> from torch.utils.data import DataLoader - ->>> train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8) ->>> eval_dataloader = DataLoader(small_eval_dataset, batch_size=8) -``` - -Carga tu modelo con el número de labels previstas: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) -``` - -### Optimiza y programa el learning rate - -Crea un optimizador y el learning rate para aplicar fine-tuning al modelo. Vamos a utilizar el optimizador [`AdamW`](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) de PyTorch: - -```py ->>> from torch.optim import AdamW - ->>> optimizer = AdamW(model.parameters(), lr=5e-5) -``` - -Crea el learning rate desde el [`Trainer`]: - -```py ->>> from transformers import get_scheduler - ->>> num_epochs = 3 ->>> num_training_steps = num_epochs * len(train_dataloader) ->>> lr_scheduler = get_scheduler( -... name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps -... ) -``` - -Por último, especifica el `device` o entorno de ejecución para utilizar una GPU si tienes acceso a una. De lo contrario, el entrenamiento en una CPU puede llevarte varias horas en lugar de un par de minutos. - -```py ->>> import torch - ->>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") ->>> model.to(device) -``` - - - -Consigue acceso gratuito a una GPU en la nube si es que no tienes este recurso de forma local con un notebook alojado en [Colaboratory](https://colab.research.google.com/) o [SageMaker StudioLab](https://studiolab.sagemaker.aws/). - - - -Genial, ¡ahora podemos entrenar! 🥳 - -### Ciclo de entrenamiento - -Para hacer un seguimiento al progreso del entrenamiento, utiliza la biblioteca [tqdm](https://tqdm.github.io/) para añadir una barra de progreso sobre el número de pasos de entrenamiento: - -```py ->>> from tqdm.auto import tqdm - ->>> progress_bar = tqdm(range(num_training_steps)) - ->>> model.train() ->>> for epoch in range(num_epochs): -... for batch in train_dataloader: -... batch = {k: v.to(device) for k, v in batch.items()} -... outputs = model(**batch) -... loss = outputs.loss -... loss.backward() - -... optimizer.step() -... lr_scheduler.step() -... optimizer.zero_grad() -... progress_bar.update(1) -``` - -### Métricas - -De la misma manera que necesitas añadir una función de evaluación al [`Trainer`], necesitas hacer lo mismo cuando escribas tu propio ciclo de entrenamiento. Pero en lugar de calcular y reportar la métrica al final de cada época, esta vez acumularás todos los batches con [`add_batch`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=add_batch#datasets.Metric.add_batch) y calcularás la métrica al final. 
- -```py ->>> metric = load_metric("accuracy") ->>> model.eval() ->>> for batch in eval_dataloader: -... batch = {k: v.to(device) for k, v in batch.items()} -... with torch.no_grad(): -... outputs = model(**batch) - -... logits = outputs.logits -... predictions = torch.argmax(logits, dim=-1) -... metric.add_batch(predictions=predictions, references=batch["labels"]) - ->>> metric.compute() -``` - - - -## Recursos adicionales - -Para más ejemplos de fine-tuning consulta: - -- [🤗 Transformers Examples](https://github.com/huggingface/transformers/tree/main/examples) incluye scripts - para entrenar tareas comunes de NLP en PyTorch y TensorFlow. - -- [🤗 Transformers Notebooks](notebooks) contiene varios notebooks sobre cómo aplicar fine-tuning a un modelo para tareas específicas en PyTorch y TensorFlow. diff --git a/docs/source/fr/_config.py b/docs/source/fr/_config.py new file mode 100644 index 000000000000..07f1de5f7db0 --- /dev/null +++ b/docs/source/fr/_config.py @@ -0,0 +1,14 @@ +# docstyle-ignore +INSTALL_CONTENT = """ +# Installation de Transformers +! pip install transformers datasets +# Pour installer à partir du code source au lieu de la dernière version, commentez la commande ci-dessus et décommentez la suivante. +# ! pip install git+https://github.com/huggingface/transformers.git +""" + +notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}] +black_avoid_patterns = { + "{processor_class}": "FakeProcessorClass", + "{model_class}": "FakeModelClass", + "{object_class}": "FakeObjectClass", +} diff --git a/docs/source/fr/_toctree.yml b/docs/source/fr/_toctree.yml new file mode 100755 index 000000000000..11632a423b6a --- /dev/null +++ b/docs/source/fr/_toctree.yml @@ -0,0 +1,156 @@ +- sections: + - local: index + title: 🤗 Transformers + - local: quicktour + title: Visite rapide + - local: in_translation + title: Installation + title: Démarrer +- sections: + - local: in_translation + title: Pipelines pour l'inférence + - local: in_translation + title: Chargement d'instances pré-entraînées avec une AutoClass + - local: in_translation + title: Préparation des données + - local: in_translation + title: Fine-tune un modèle pré-entraîné + - local: in_translation + title: Entraînement distribué avec 🤗 Accelerate + - local: in_translation + title: Partager un modèle + title: Tutoriels +- sections: + - sections: + - local: in_translation + title: Créer votre architecture + - local: in_translation + title: Partager vos modèles + - local: in_translation + title: Entraînement avec un script + - local: in_translation + title: Entraînement avec Amazon SageMaker + - local: in_translation + title: Convertir depuis des checkpoints Tensorflow + - local: in_translation + title: Exporter vers ONNX + - local: in_translation + title: Exporter vers TorchScript + - local: in_translation + title: Aide au dépannage + title: Usage général + - sections: + - local: in_translation + title: Utiliser les tokenizers de 🤗 Tokenizers + - local: in_translation + title: Inférence avec les modèles multilingues + - local: in_translation + title: Stratégies de génération de texte + - sections: + - isExpanded: false + local: in_translation + title: Classification de texte + - local: in_translation + title: Classification de token + - local: in_translation + title: Système de question-réponse + - local: in_translation + title: Modélisation causale du langage + - local: in_translation + title: Modélisation du langage avec masque + - local: in_translation + title: Traduction + - local: in_translation + title: 
Génération de résumé + - local: in_translation + title: Question à choix multiple + title: Guides des tâches + title: Traitement automatique des langues + - sections: + - local: in_translation + title: Classification audio + - local: in_translation + title: Reconnaissance automatique de la parole + title: Audio + - sections: + - local: in_translation + title: Classification d'images + - local: in_translation + title: Segmentation sémantique + - local: in_translation + title: Classification de vidéos + - local: in_translation + title: Détection d'objets + title: Vision par ordinateur + - sections: + - local: in_translation + title: Performance et extensibilité + - sections: + - local: in_translation + title: Comment contribuer à transformers? + - local: in_translation + title: Comment ajouter un modèle à 🤗 Transformers? + - local: in_translation + title: Comment convertir un modèle 🤗 Transformers vers TensorFlow? + - local: in_translation + title: Comment ajouter un pipeline à 🤗 Transformers? + - local: in_translation + title: Tester + - local: in_translation + title: Vérification pour une Pull Request + title: Contribuer + - local: in_translation + title: 🤗 Transformers Notebooks + - local: in_translation + title: Ressources communautaires + - local: in_translation + title: Benchmarks + - local: in_translation + title: Migration à partir de versions précédentes + title: Guides d'utilisation +- sections: + - local: in_translation + title: Philosophie + - local: in_translation + title: Glossaire + - local: in_translation + title: Qu'est ce 🤗 Transformers peut faire ? + - local: in_translation + title: Quelles tâches 🤗 Transformers peut résoudre ? + - local: in_translation + title: Résumé des modèles + - local: in_translation + title: Résumé des tokenizers + - local: in_translation + title: Remplissage et troncature + - local: in_translation + title: BERTology + - local: in_translation + title: Perplexité des modèles à longueur fixe + - local: in_translation + title: Pipelines pour inférence avec des serveurs web + title: Guides conceptuels +- sections: + - isExpanded: false + sections: + - local: in_translation + title: Classes principales + - local: in_translation + title: Modèles textuels + - local: in_translation + title: Modèles visuels + - local: in_translation + title: Modèles audio + - local: in_translation + title: Modèles multimodal + - local: in_translation + title: Modèles d'apprentissage par renforcement + - local: in_translation + title: Modèles de séries temporelles + - local: in_translation + title: Graph models + title: Modèles + - sections: + - local: in_translation + title: Utilitaires internes + title: API diff --git a/docs/source/fr/in_translation.md b/docs/source/fr/in_translation.md new file mode 100644 index 000000000000..910559ef6c9a --- /dev/null +++ b/docs/source/fr/in_translation.md @@ -0,0 +1,5 @@ + + +# Traduction en cours. \ No newline at end of file diff --git a/docs/source/fr/index.md b/docs/source/fr/index.md new file mode 100644 index 000000000000..9e3e6eb5c236 --- /dev/null +++ b/docs/source/fr/index.md @@ -0,0 +1,410 @@ + + +# 🤗 Transformers + +Apprentissage automatique de pointe pour [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/), et [JAX](https://jax.readthedocs.io/en/latest/). + +🤗 Transformers fournit des API et des outils pour télécharger et entraîner facilement des modèles pré-entraînés de pointe. 
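+Par exemple, quelques lignes suffisent pour télécharger un modèle pré-entraîné et l'utiliser pour l'inférence (esquisse indicative avec l'API `pipeline` ; le modèle chargé par défaut est anglophone) :
+
+```py
+from transformers import pipeline
+
+# Télécharge un modèle pré-entraîné d'analyse de sentiment et l'applique à une phrase
+classifier = pipeline("sentiment-analysis")
+print(classifier("We are very happy to show you the 🤗 Transformers library."))
+```
+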
L'utilisation de modèles pré-entraînés peut réduire vos coûts de calcul, votre empreinte carbone, et vous faire économiser le temps et les ressources nécessaires pour entraîner un modèle à partir de zéro. Ces modèles prennent en charge des tâches courantes dans différentes modalités, telles que : + +📝 **Traitement automatique des langues**: classification de texte, reconnaissance d'entités, système de question-réponse, modèle de langage, génération de résumé, traduction, question à choix multiples et génération de texte.
+🖼️ **Vision par ordinateur**: classification d'image, détection d'objet et segmentation.
+🗣️ **Audio**: reconnaissance automatique de la parole et classification audio.
+🐙 **Multimodalité**: système de question-réponse avec des tableaux ou images, reconnaissance optique de caractères, extraction d'information depuis des documents scannés et classification de vidéo. + +🤗 Transformers prend en charge l'interopérabilité entre PyTorch, TensorFlow et JAX. Cela permet d'utiliser un framework différent à chaque étape de la vie d'un modèle, par exemple entraîner un modèle en trois lignes de code avec un framework, et le charger pour l'inférence avec un autre. Les modèles peuvent également être exportés dans un format comme ONNX et TorchScript pour être déployés dans des environnements de production. + +Rejoignez la communauté grandissante sur le [Hub](https://huggingface.co/models), le [forum](https://discuss.huggingface.co/) ou [Discord](https://discord.com/invite/JfAtkvEtRb) dès aujourd'hui ! + +## Si vous cherchez un support personnalisé de l'équipe Hugging Face + + + HuggingFace Expert Acceleration Program + + +## Contents + +La documentation est organisée en 5 parties: + +- **DEMARRER** propose une visite rapide de la bibliothèque et des instructions d'installation pour être opérationnel. +- **TUTORIELS** excellent point de départ pour les débutants. Cette section vous aidera à acquérir les compétences de base dont vous avez besoin pour commencer à utiliser la bibliothèque. +- **GUIDES D'UTILISATION** pour différentes tâches comme par exemple le finetuning d'un modèle pré-entraîné pour la classification de texte ou comment créer et partager votre propre modèle. +- **GUIDES CONCEPTUELS** pour plus de discussions et d'explications sur les concepts et les idées sous-jacentes aux modèles, aux tâches et à la philosophie de conception de 🤗 Transformers. +- **API** décrit toutes les classes et fonctions : + + - **CLASSES PRINCIPALES** détaille les classes les plus importantes comme la configuration, le modèle, le tokenizer et le pipeline.. + - **MODELES** détaille les classes et les fonctions propres à chaque modèle de la bibliothèque. + - **UTILITAIRES INTERNES** détaille les classes et fonctions utilitaires utilisées en interne. + +### Modèles supportés + + + +1. **[ALBERT](model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. +1. **[ALIGN](model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig. +1. **[AltCLIP](model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. +1. **[Audio Spectrogram Transformer](model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. +1. 
**[BART](model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. +1. **[BARThez](model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. +1. **[BARTpho](model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. +1. **[BEiT](model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei. +1. **[BERT](model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. +1. **[BERT For Sequence Generation](model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[BERTweet](model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. +1. **[BigBird-Pegasus](model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[BigBird-RoBERTa](model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[BioGpt](model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. **[BiT](model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. +1. **[Blenderbot](model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. +1. 
**[BlenderbotSmall](model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. +1. **[BLIP](model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLOOM](model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). +1. **[BORT](model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry. +1. **[BridgeTower](model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. +1. **[ByT5](model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. +1. **[CamemBERT](model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. +1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. +1. **[Chinese-CLIP](model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. +1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. +1. **[CLIPSeg](model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. +1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. 
**[Conditional DETR](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. +1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. +1. **[ConvNeXTV2](model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. +1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. +1. **[CTRL](model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. +1. **[CvT](model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. +1. **[Data2Vec](model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. +1. **[DeBERTa](model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. +1. **[DeBERTa-v2](model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. +1. **[Decision Transformer](model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. +1. **[Deformable DETR](model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai. +1. 
**[DeiT](model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. +1. **[DETA](model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl. +1. **[DETR](model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. +1. **[DialoGPT](model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. +1. **[DiNAT](model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi. +1. **[DistilBERT](model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. +1. **[DiT](model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. +1. **[Donut](model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. +1. **[DPR](model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. +1. **[DPT](master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. +1. **[EfficientFormer](model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. +1. 
**[ELECTRA](model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. +1. **[EncoderDecoder](model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[ERNIE](model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu. +1. **[ESM](model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. +1. **[FLAN-T5](model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. **[FlauBERT](model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. +1. **[FLAVA](model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. +1. **[FNet](model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. +1. 
**[Funnel Transformer](model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. +1. **[GIT](model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. +1. **[GLPN](model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. +1. **[GPT](model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. +1. **[GPT Neo](model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. +1. **[GPT NeoX](model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach +1. **[GPT NeoX Japanese](model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori. +1. **[GPT-2](model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. +1. **[GPT-J](model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. +1. **[GPT-Sw3](model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. **[Graphormer](model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[GroupViT](model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. +1. 
**[Hubert](model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. +1. **[I-BERT](model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. +1. **[ImageGPT](model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. +1. **[Jukebox](model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever. +1. **[LayoutLM](model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. +1. **[LayoutLMv2](model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. +1. **[LayoutLMv3](model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei. +1. **[LayoutXLM](model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. +1. **[LED](model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. +1. **[LeViT](model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze. +1. **[LiLT](model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding. +1. **[Longformer](model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. +1. **[LongT5](model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang. +1. 
**[LUKE](model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. +1. **[LXMERT](model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal. +1. **[M-CTC-T](model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert. +1. **[M2M100](model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. +1. **[MarianMT](model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. +1. **[MarkupLM](model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei. +1. **[Mask2Former](model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. +1. **[MaskFormer](model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. +1. **[mBART](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. +1. **[mBART-50](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. +1. **[Megatron-BERT](model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +1. **[Megatron-GPT2](model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +1. 
**[mLUKE](model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka. +1. **[MobileBERT](model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou. +1. **[MobileNetV1](model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam. +1. **[MobileNetV2](model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen. +1. **[MobileViT](model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari. +1. **[MPNet](model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. +1. **[MT5](model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. +1. **[MVP](model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. +1. **[NAT](model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi. +1. **[Nezha](model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. +1. **[NLLB](model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. **[Nyströmformer](model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. +1. **[OneFormer](model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. +1. 
**[OPT](model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. +1. **[OWL-ViT](model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. +1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. +1. **[PEGASUS-X](model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. +1. **[Perceiver IO](model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. +1. **[PhoBERT](model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. +1. **[PLBart](model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. +1. **[PoolFormer](model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. +1. **[ProphetNet](model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. +1. **[QDQBert](model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. +1. **[RAG](model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. +1. 
**[REALM](model_doc/realm)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. +1. **[Reformer](model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. +1. **[RegNet](model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. +1. **[RemBERT](model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. +1. **[ResNet](model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. +1. **[RoBERTa](model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. +1. **[RoBERTa-PreLayerNorm](model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. +1. **[RoCBert](model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. +1. **[RoFormer](model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. +1. **[SegFormer](model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. +1. **[SEW](model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. **[SEW-D](model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. **[SpeechT5](model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei. +1. 
**[SpeechToTextTransformer](model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. +1. **[SpeechToTextTransformer2](model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. +1. **[Splinter](model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. +1. **[SqueezeBERT](model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. +1. **[Swin Transformer](model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. +1. **[Swin Transformer V2](model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. +1. **[Swin2SR](model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. +1. **[SwitchTransformers](model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. +1. **[T5](model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. +1. **[T5v1.1](model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. +1. **[Table Transformer](model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. +1. **[TAPAS](model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. +1. 
**[TAPEX](model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[Time Series Transformer](model_doc/time_series_transformer)** (from HuggingFace). +1. **[TimeSformer](model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. +1. **[Trajectory Transformer](model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine +1. **[Transformer-XL](model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. +1. **[TrOCR](model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. +1. **[UL2](model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler +1. **[UniSpeech](model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. +1. **[UniSpeechSat](model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. +1. **[UPerNet](model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun. +1. **[VAN](model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. +1. **[VideoMAE](model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. +1. **[ViLT](model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. +1. 
**[Vision Transformer (ViT)](model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[VisualBERT](model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. +1. **[ViT Hybrid](model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[ViTMAE](model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. +1. **[ViTMSN](model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. +1. **[Wav2Vec2](model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. +1. **[Wav2Vec2-Conformer](model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. +1. **[Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. +1. **[WavLM](model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. +1. **[Whisper](model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever. +1. **[X-CLIP](model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling. +1. 
**[XGLM](model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. +1. **[XLM](model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. +1. **[XLM-ProphetNet](model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. +1. **[XLM-RoBERTa](model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. +1. **[XLM-RoBERTa-XL](model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. +1. **[XLNet](model_doc/xlnet)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. +1. **[XLS-R](model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. +1. **[XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. +1. **[YOLOS](model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu. +1. **[YOSO](model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. + + +### Frameworks compatibles + +Le tableau ci-dessous représente la prise en charge actuelle dans la bibliothèque pour chacun de ces modèles, qu'ils aient ou non un tokenizer Python (appelé "slow"). 
Un tokenizer rapide ("fast") soutenu par la bibliothèque 🤗 Tokenizers, qu'ils aient un support en Jax (via Flax), PyTorch, et/ou TensorFlow. + + + +| Modèle | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support | +|:-----------------------------:|:--------------:|:--------------:|:---------------:|:------------------:|:------------:| +| ALBERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| AltCLIP | ❌ | ❌ | ✅ | ❌ | ❌ | +| Audio Spectrogram Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| BART | ✅ | ✅ | ✅ | ✅ | ✅ | +| BEiT | ❌ | ❌ | ✅ | ❌ | ✅ | +| BERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ | +| BigBird | ✅ | ✅ | ✅ | ❌ | ✅ | +| BigBird-Pegasus | ❌ | ❌ | ✅ | ❌ | ❌ | +| BioGpt | ✅ | ❌ | ✅ | ❌ | ❌ | +| BiT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Blenderbot | ✅ | ✅ | ✅ | ✅ | ✅ | +| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ | +| BLIP | ❌ | ❌ | ✅ | ❌ | ❌ | +| BLOOM | ❌ | ✅ | ✅ | ❌ | ❌ | +| BridgeTower | ❌ | ❌ | ✅ | ❌ | ❌ | +| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | +| Chinese-CLIP | ❌ | ❌ | ✅ | ❌ | ❌ | +| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | +| CLIPSeg | ❌ | ❌ | ✅ | ❌ | ❌ | +| CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | +| Conditional DETR | ❌ | ❌ | ✅ | ❌ | ❌ | +| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| ConvNeXT | ❌ | ❌ | ✅ | ✅ | ❌ | +| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | +| CvT | ❌ | ❌ | ✅ | ✅ | ❌ | +| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecVision | ❌ | ❌ | ✅ | ✅ | ❌ | +| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | +| DeBERTa-v2 | ✅ | ✅ | ✅ | ✅ | ❌ | +| Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| Deformable DETR | ❌ | ❌ | ✅ | ❌ | ❌ | +| DeiT | ❌ | ❌ | ✅ | ✅ | ❌ | +| DETA | ❌ | ❌ | ✅ | ❌ | ❌ | +| DETR | ❌ | ❌ | ✅ | ❌ | ❌ | +| DiNAT | ❌ | ❌ | ✅ | ❌ | ❌ | +| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| DonutSwin | ❌ | ❌ | ✅ | ❌ | ❌ | +| DPR | ✅ | ✅ | ✅ | ✅ | ❌ | +| DPT | ❌ | ❌ | ✅ | ❌ | ❌ | +| EfficientFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | +| Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | +| ERNIE | ❌ | ❌ | ✅ | ❌ | ❌ | +| ESM | ✅ | ❌ | ✅ | ✅ | ❌ | +| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ | +| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ | +| FLAVA | ❌ | ❌ | ✅ | ❌ | ❌ | +| FNet | ✅ | ✅ | ✅ | ❌ | ❌ | +| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ | +| GIT | ❌ | ❌ | ✅ | ❌ | ❌ | +| GLPN | ❌ | ❌ | ✅ | ❌ | ❌ | +| GPT Neo | ❌ | ❌ | ✅ | ❌ | ✅ | +| GPT NeoX | ❌ | ✅ | ✅ | ❌ | ❌ | +| GPT NeoX Japanese | ✅ | ❌ | ✅ | ❌ | ❌ | +| GPT-J | ❌ | ❌ | ✅ | ✅ | ✅ | +| GPT-Sw3 | ✅ | ✅ | ✅ | ✅ | ✅ | +| Graphormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| GroupViT | ❌ | ❌ | ✅ | ✅ | ❌ | +| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ | +| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Jukebox | ✅ | ❌ | ✅ | ❌ | ❌ | +| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ | +| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ | +| LayoutLMv3 | ✅ | ✅ | ✅ | ✅ | ❌ | +| LED | ✅ | ✅ | ✅ | ✅ | ❌ | +| LeViT | ❌ | ❌ | ✅ | ❌ | ❌ | +| LiLT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ | +| LongT5 | ❌ | ❌ | ✅ | ❌ | ✅ | +| LUKE | ✅ | ❌ | ✅ | ❌ | ❌ | +| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| M-CTC-T | ❌ | ❌ | ✅ | ❌ | ❌ | +| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ | +| Marian | ✅ | ❌ | ✅ | ✅ | ✅ | +| MarkupLM | ✅ | ✅ | ✅ | ❌ | ❌ | +| Mask2Former | ❌ | ❌ | ✅ | ❌ | ❌ | +| MaskFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| MaskFormerSwin | ❌ | ❌ | ❌ | ❌ | ❌ | +| mBART | ✅ | ✅ | ✅ | ✅ | ✅ | +| Megatron-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | +| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| MobileNetV1 | ❌ | ❌ | ✅ | ❌ | ❌ | +| MobileNetV2 | ❌ | ❌ | ✅ | ❌ | ❌ | +| MobileViT | ❌ | ❌ | ✅ | ✅ | ❌ | +| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ | +| MT5 | ✅ | ✅ | ✅ | ✅ | ✅ | +| MVP | ✅ | ✅ | ✅ | ❌ | ❌ | +| NAT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Nezha | ❌ | ❌ | ✅ | ❌ | 
❌ | +| Nyströmformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| OneFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ | +| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | +| OPT | ❌ | ❌ | ✅ | ✅ | ✅ | +| OWL-ViT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ | +| PEGASUS-X | ❌ | ❌ | ✅ | ❌ | ❌ | +| Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ | +| PLBart | ✅ | ❌ | ✅ | ❌ | ❌ | +| PoolFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | +| QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ | +| RAG | ✅ | ❌ | ✅ | ✅ | ❌ | +| REALM | ✅ | ✅ | ✅ | ❌ | ❌ | +| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ | +| RegNet | ❌ | ❌ | ✅ | ✅ | ✅ | +| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| ResNet | ❌ | ❌ | ✅ | ✅ | ❌ | +| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | +| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | +| RoBERTa-PreLayerNorm | ❌ | ❌ | ✅ | ✅ | ✅ | +| RoCBert | ✅ | ❌ | ✅ | ❌ | ❌ | +| RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ | +| SegFormer | ❌ | ❌ | ✅ | ✅ | ❌ | +| SEW | ❌ | ❌ | ✅ | ❌ | ❌ | +| SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ | +| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ | +| Speech2Text | ✅ | ❌ | ✅ | ✅ | ❌ | +| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ | +| SpeechT5 | ✅ | ❌ | ✅ | ❌ | ❌ | +| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ | +| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ | +| Swin Transformer | ❌ | ❌ | ✅ | ✅ | ❌ | +| Swin Transformer V2 | ❌ | ❌ | ✅ | ❌ | ❌ | +| Swin2SR | ❌ | ❌ | ✅ | ❌ | ❌ | +| SwitchTransformers | ❌ | ❌ | ✅ | ❌ | ❌ | +| T5 | ✅ | ✅ | ✅ | ✅ | ✅ | +| Table Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ | +| Time Series Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| TimeSformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| Trajectory Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | +| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ | +| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ | +| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ | +| UPerNet | ❌ | ❌ | ✅ | ❌ | ❌ | +| VAN | ❌ | ❌ | ✅ | ❌ | ❌ | +| VideoMAE | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViLT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | +| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ | +| VisualBERT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViT | ❌ | ❌ | ✅ | ✅ | ✅ | +| ViT Hybrid | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViTMAE | ❌ | ❌ | ✅ | ✅ | ❌ | +| ViTMSN | ❌ | ❌ | ✅ | ❌ | ❌ | +| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | +| Wav2Vec2-Conformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| WavLM | ❌ | ❌ | ✅ | ❌ | ❌ | +| Whisper | ✅ | ❌ | ✅ | ✅ | ❌ | +| X-CLIP | ❌ | ❌ | ✅ | ❌ | ❌ | +| XGLM | ✅ | ✅ | ✅ | ✅ | ✅ | +| XLM | ✅ | ❌ | ✅ | ✅ | ❌ | +| XLM-ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | +| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | +| XLM-RoBERTa-XL | ❌ | ❌ | ✅ | ❌ | ❌ | +| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ | +| YOLOS | ❌ | ❌ | ✅ | ❌ | ❌ | +| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ | + + diff --git a/docs/source/fr/quicktour.md b/docs/source/fr/quicktour.md new file mode 100644 index 000000000000..666a931f825f --- /dev/null +++ b/docs/source/fr/quicktour.md @@ -0,0 +1,550 @@ + + +# Visite rapide + +[[open-in-colab]] + +Soyez opérationnel avec 🤗 Transformers ! Que vous soyez un développeur ou un utilisateur lambda, cette visite rapide vous aidera à démarrer et vous montrera comment utiliser le [`pipeline`] pour l'inférence, charger un modèle pré-entraîné et un préprocesseur avec une [AutoClass](./model_doc/auto), et entraîner rapidement un modèle avec PyTorch ou TensorFlow. Si vous êtes un débutant, nous vous recommandons de consulter nos tutoriels ou notre [cours](https://huggingface.co/course/chapter1/1) suivant pour des explications plus approfondies des concepts présentés ici. 
+ +Avant de commencer, assurez-vous que vous avez installé toutes les bibliothèques nécessaires : + +```bash +!pip install transformers datasets +``` + +Vous aurez aussi besoin d'installer votre bibliothèque d'apprentissage profond favorite : + + + + +```bash +pip install torch +``` + + + +```bash +pip install tensorflow +``` + + + +## Pipeline + + + +Le [`pipeline`] est le moyen le plus simple d'utiliser un modèle pré-entraîné pour l'inférence. Vous pouvez utiliser le [`pipeline`] prêt à l'emploi pour de nombreuses tâches dans différentes modalités. Consultez le tableau ci-dessous pour connaître les tâches prises en charge : + +| **Tâche** | **Description** | **Modalité** | **Identifiant du pipeline** | +|------------------------------|--------------------------------------------------------------------------------------------------------------|----------------------|-----------------------------------------------| +| Classification de texte | Attribue une catégorie à une séquence de texte donnée | Texte | pipeline(task="sentiment-analysis") | +| Génération de texte | Génère du texte à partir d'une consigne donnée | Texte | pipeline(task="text-generation") | +| Reconnaissance de token nommé | Attribue une catégorie à chaque token dans une séquence (personnes, organisation, localisation, etc.) | Texte | pipeline(task="ner") | +| Question réponse | Extrait une réponse du texte en fonction du contexte et d'une question | Texte | pipeline(task="question-answering") | +| Prédiction de token masqué | Prédit correctement le token masqué dans une séquence | Texte | pipeline(task="fill-mask") | +| Génération de résumé | Génère un résumé d'une séquence de texte donnée ou d'un document | Texte | pipeline(task="summarization") | +| Traduction | Traduit du texte d'un langage à un autre | Texte | pipeline(task="translation") | +| Classification d'image | Attribue une catégorie à une image | Image | pipeline(task="image-classification") | +| Segmentation d'image | Attribue une catégorie à chaque pixel d'une image (supporte la segmentation sémantique, panoptique et d'instance) | Image | pipeline(task="image-segmentation") | +| Détection d'objets | Prédit les délimitations et catégories d'objets dans une image | Image | pipeline(task="object-detection") | +| Classification d'audio | Attribue une catégorie à un fichier audio | Audio | pipeline(task="audio-classification") | +| Reconnaissance automatique de la parole | Extrait le discours d'un fichier audio en texte | Audio | pipeline(task="automatic-speech-recognition") | +| Question réponse visuels | Etant données une image et une question, répond correctement à une question sur l'image | Modalités multiples | pipeline(task="vqa") | + +Commencez par créer une instance de [`pipeline`] et spécifiez la tâche pour laquelle vous souhaitez l'utiliser. Vous pouvez utiliser le [`pipeline`] pour n'importe laquelle des tâches mentionnées dans le tableau précédent. Pour obtenir une liste complète des tâches prises en charge, consultez la documentation de l'[API pipeline](./main_classes/pipelines). Dans ce guide, nous utiliserons le [`pipeline`] pour l'analyse des sentiments à titre d'exemple : + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline("sentiment-analysis") +``` + +Le [`pipeline`] télécharge et stocke en cache un [modèle pré-entraîné](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) et un tokenizer par défaut pour l'analyse des sentiments. 
Vous pouvez maintenant utiliser le `classifier` sur le texte de votre choix : + +```py +>>> classifier("We are very happy to show you the 🤗 Transformers library.") +[{'label': 'POSITIVE', 'score': 0.9998}] +``` + +Si vous voulez classifier plus d'un texte, donnez une liste de textes au [`pipeline`] pour obtenir une liste de dictionnaires en retour : + +```py +>>> results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."]) +>>> for result in results: +... print(f"label: {result['label']}, avec le score de: {round(result['score'], 4)}") +label: POSITIVE, avec le score de: 0.9998 +label: NEGATIVE, avec le score de: 0.5309 +``` + +Le [`pipeline`] peut aussi itérer sur un jeu de données entier pour n'importe quelle tâche. Prenons par exemple la reconnaissance automatique de la parole : + +```py +>>> import torch +>>> from transformers import pipeline + +>>> speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h") +``` + +Chargez un jeu de données audio (voir le 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart#audio) pour plus de détails) sur lequel vous souhaitez itérer. Pour cet exemple, nous chargeons le jeu de données [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) : + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") # doctest: +IGNORE_RESULT +``` + +Vous devez vous assurer que le taux d'échantillonnage du jeu de données correspond au taux d'échantillonnage sur lequel [`facebook/wav2vec2-base-960h`](https://huggingface.co/facebook/wav2vec2-base-960h) a été entraîné : + +```py +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate)) +``` + +Les fichiers audio sont automatiquement chargés et rééchantillonnés lors de l'appel de la colonne `"audio"`. +Extrayez les tableaux de formes d'ondes brutes des quatre premiers échantillons et passez-les comme une liste au pipeline : + +```py +>>> result = speech_recognizer(dataset[:4]["audio"]) +>>> print([d["text"] for d in result]) +['I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT', "FODING HOW I'D SET UP A JOIN TO HET WITH MY WIFE AND WHERE THE AP MIGHT BE", "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE AP SO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AND I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS", 'HOW DO I THURN A JOIN A COUNT'] +``` + +Pour les ensembles de données plus importants où les entrées sont volumineuses (comme dans les domaines de la parole ou de la vision), utilisez plutôt un générateur au lieu d'une liste afin d'éviter de charger toutes les entrées en mémoire. Pour plus d'informations, consultez la documentation de l'[API pipeline](./main_classes/pipelines). + +### Utiliser un autre modèle et tokenizer dans le pipeline + +Le [`pipeline`] peut être utilisé avec n'importe quel modèle du [Hub](https://huggingface.co/models), ce qui permet d'adapter facilement le [`pipeline`] à d'autres cas d'utilisation. Par exemple, si vous souhaitez un modèle capable de traiter du texte français, utilisez les filtres du Hub pour trouver un modèle approprié. 
Le premier résultat renvoie un [modèle BERT](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment) multilingue finetuné pour l'analyse des sentiments que vous pouvez utiliser pour le texte français : + +```py +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +``` + + + +Utilisez [`AutoModelForSequenceClassification`] et [`AutoTokenizer`] pour charger le modèle pré-entraîné et le tokenizer adapté (plus de détails sur une `AutoClass` dans la section suivante) : + +```py +>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained(model_name) +>>> tokenizer = AutoTokenizer.from_pretrained(model_name) +``` + + +Utilisez [`TFAutoModelForSequenceClassification`] et [`AutoTokenizer`] pour charger le modèle pré-entraîné et le tokenizer adapté (plus de détails sur une `TFAutoClass` dans la section suivante) : + +```py +>>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name) +>>> tokenizer = AutoTokenizer.from_pretrained(model_name) +``` + + + +Spécifiez le modèle et le tokenizer dans le [`pipeline`], et utilisez le `classifier` sur le texte en français : + +```py +>>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer) +>>> classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.") +[{'label': '5 stars', 'score': 0.7273}] +``` + +Si vous ne parvenez pas à trouver un modèle adapté à votre cas d'utilisation, vous devrez finetuner un modèle pré-entraîné sur vos données. Jetez un coup d'œil à notre [tutoriel sur le finetuning](./training) pour apprendre comment faire. Enfin, après avoir finetuné votre modèle pré-entraîné, pensez à [partager](./model_sharing) le modèle avec la communauté sur le Hub afin de démocratiser l'apprentissage automatique pour tous ! 🤗 + +## AutoClass + + + +Les classes [`AutoModelForSequenceClassification`] et [`AutoTokenizer`] fonctionnent ensemble pour créer un [`pipeline`] comme celui que vous avez utilisé ci-dessus. Une [AutoClass](./model_doc/auto) est un raccourci qui récupère automatiquement l'architecture d'un modèle pré-entraîné à partir de son nom ou de son emplacement. Il vous suffit de sélectionner l'`AutoClass` appropriée à votre tâche et la classe de prétraitement qui lui est associée. + +Reprenons l'exemple de la section précédente et voyons comment vous pouvez utiliser l'`AutoClass` pour reproduire les résultats du [`pipeline`]. + +### AutoTokenizer + +Un tokenizer est chargé de prétraiter le texte pour en faire un tableau de chiffres qui servira d'entrée à un modèle. De nombreuses règles régissent le processus de tokenisation, notamment la manière de diviser un mot et le niveau auquel les mots doivent être divisés (pour en savoir plus sur la tokenisation, consultez le [résumé](./tokenizer_summary)). La chose la plus importante à retenir est que vous devez instancier un tokenizer avec le même nom de modèle pour vous assurer que vous utilisez les mêmes règles de tokenisation que celles avec lesquelles un modèle a été pré-entraîné. 
+ +Chargez un tokenizer avec [`AutoTokenizer`] : + +```py +>>> from transformers import AutoTokenizer + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> tokenizer = AutoTokenizer.from_pretrained(model_name) +``` + +Passez votre texte au tokenizer : + +```py +>>> encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.") +>>> print(encoding) +{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 100, 58263, 13299, 119, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +Le tokenizer retourne un dictionnaire contenant : + +* [input_ids](./glossary#input-ids): la représentation numérique des tokens. +* [attention_mask](.glossary#attention-mask): indique quels tokens doivent faire l'objet d'une attention particulière (plus particulièrement les tokens de remplissage). + +Un tokenizer peut également accepter une liste de textes, et remplir et tronquer le texte pour retourner un échantillon de longueur uniforme : + + + + +```py +>>> pt_batch = tokenizer( +... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], +... padding=True, +... truncation=True, +... max_length=512, +... return_tensors="pt", +... ) +``` + + + +```py +>>> tf_batch = tokenizer( +... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], +... padding=True, +... truncation=True, +... max_length=512, +... return_tensors="tf", +... ) +``` + + + + + +Consultez le tutoriel [prétraitement](./preprocessing) pour plus de détails sur la tokenisation, et sur la manière d'utiliser un [`AutoImageProcessor`], un [`AutoFeatureExtractor`] et un [`AutoProcessor`] pour prétraiter les images, l'audio et les contenus multimodaux. + + + +### AutoModel + + + +🤗 Transformers fournit un moyen simple et unifié de charger des instances pré-entraînées. Cela signifie que vous pouvez charger un [`AutoModel`] comme vous chargeriez un [`AutoTokenizer`]. La seule différence est de sélectionner l'[`AutoModel`] approprié pour la tâche. Pour une classification de texte (ou de séquence de textes), vous devez charger [`AutoModelForSequenceClassification`] : + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name) +``` + + + +Voir le [résumé de la tâche](./task_summary) pour vérifier si elle est prise en charge par une classe [`AutoModel`]. + + + +Maintenant, passez votre échantillon d'entrées prétraitées directement au modèle. Il vous suffit de décompresser le dictionnaire en ajoutant `**` : + +```py +>>> pt_outputs = pt_model(**pt_batch) +``` + +Le modèle produit les activations finales dans l'attribut `logits`. Appliquez la fonction softmax aux `logits` pour récupérer les probabilités : + +```py +>>> from torch import nn + +>>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1) +>>> print(pt_predictions) +tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], + [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=) +``` + + +🤗 Transformers fournit un moyen simple et unifié de charger des instances pré-entraînés. Cela signifie que vous pouvez charger un [`TFAutoModel`] comme vous chargeriez un [`AutoTokenizer`]. La seule différence est de sélectionner le [`TFAutoModel`] approprié pour la tâche. 
Pour une classification de texte (ou de séquence de textes), vous devez charger [`TFAutoModelForSequenceClassification`] : + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name) +``` + + + +Voir le [résumé de la tâche](./task_summary) pour vérifier si elle est prise en charge par une classe [`AutoModel`]. + + + +Passez maintenant votre échantillon d'entrées prétraitées directement au modèle en passant les clés du dictionnaire directement aux tensors : + +```py +>>> tf_outputs = tf_model(tf_batch) +``` + +Le modèle produit les activations finales dans l'attribut `logits`. Appliquez la fonction softmax aux `logits` pour récupérer les probabilités : + +```py +>>> import tensorflow as tf + +>>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1) +>>> tf_predictions # doctest: +IGNORE_RESULT +``` + + + + + +Tous les modèles 🤗 Transformers (PyTorch ou TensorFlow) produisent les tensors *avant* la fonction d'activation finale (comme softmax) car la fonction d'activation finale est souvent fusionnée avec le calcul de la perte. Les structures produites par le modèle sont des classes de données spéciales, de sorte que leurs attributs sont autocomplétés dans un environnement de développement. Les structures produites par le modèle se comportent comme un tuple ou un dictionnaire (vous pouvez les indexer avec un entier, une tranche ou une chaîne), auquel cas les attributs qui sont None sont ignorés. + + + +### Sauvegarder un modèle + + + +Une fois que votre modèle est finetuné, vous pouvez le sauvegarder avec son tokenizer en utilisant [`PreTrainedModel.save_pretrained`] : + +```py +>>> pt_save_directory = "./pt_save_pretrained" +>>> tokenizer.save_pretrained(pt_save_directory) # doctest: +IGNORE_RESULT +>>> pt_model.save_pretrained(pt_save_directory) +``` + +Lorsque vous voulez réutiliser le modèle, rechargez-le avec [`PreTrainedModel.from_pretrained`] : + +```py +>>> pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained") +``` + + +Une fois que votre modèle est finetuné, vous pouvez le sauvegarder avec son tokenizer en utilisant [`TFPreTrainedModel.save_pretrained`] : + +```py +>>> tf_save_directory = "./tf_save_pretrained" +>>> tokenizer.save_pretrained(tf_save_directory) # doctest: +IGNORE_RESULT +>>> tf_model.save_pretrained(tf_save_directory) +``` + +Lorsque vous voulez réutiliser le modèle, rechargez-le avec [`TFPreTrainedModel.from_pretrained`] : + +```py +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained") +``` + + + +Une fonctionnalité particulièrement cool de 🤗 Transformers est la possibilité d'enregistrer un modèle et de le recharger en tant que modèle PyTorch ou TensorFlow. 
Le paramètre `from_pt` ou `from_tf` permet de convertir le modèle d'un framework à l'autre : + + + + +```py +>>> from transformers import AutoModel + +>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) +``` + + + +```py +>>> from transformers import TFAutoModel + +>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) +``` + + + +## Constructions de modèles personnalisés + +Vous pouvez modifier la configuration du modèle pour changer la façon dont un modèle est construit. La configuration spécifie les attributs d'un modèle, tels que le nombre de couches ou de têtes d'attention. Vous partez de zéro lorsque vous initialisez un modèle à partir d'une configuration personnalisée. Les attributs du modèle sont initialisés de manière aléatoire et vous devrez entraîner le modèle avant de pouvoir l'utiliser pour obtenir des résultats significatifs. + +Commencez par importer [`AutoConfig`], puis chargez le modèle pré-entraîné que vous voulez modifier. Dans [`AutoConfig.from_pretrained`], vous pouvez spécifier l'attribut que vous souhaitez modifier, tel que le nombre de têtes d'attention : + +```py +>>> from transformers import AutoConfig + +>>> my_config = AutoConfig.from_pretrained("distilbert-base-uncased", n_heads=12) +``` + + + +Créez un modèle personnalisé à partir de votre configuration avec [`AutoModel.from_config`] : + +```py +>>> from transformers import AutoModel + +>>> my_model = AutoModel.from_config(my_config) +``` + + +Créez un modèle personnalisé à partir de votre configuration avec [`TFAutoModel.from_config`] : + +```py +>>> from transformers import TFAutoModel + +>>> my_model = TFAutoModel.from_config(my_config) +``` + + + +Consultez le guide [Créer une architecture personnalisée](./create_a_model) pour plus d'informations sur la création de configurations personnalisées. + +## Trainer - une boucle d'entraînement optimisée par PyTorch + +Tous les modèles sont des [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) standard, vous pouvez donc les utiliser dans n'importe quelle boucle d'entraînement typique. Bien que vous puissiez écrire votre propre boucle d'entraînement, 🤗 Transformers fournit une classe [`Trainer`] pour PyTorch, qui contient la boucle d'entraînement de base et ajoute des fonctionnalités supplémentaires comme l'entraînement distribué, la précision mixte, et plus encore. + +En fonction de votre tâche, vous passerez généralement les paramètres suivants à [`Trainer`] : + +1. Un [`PreTrainedModel`] ou un [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module): + + ```py + >>> from transformers import AutoModelForSequenceClassification + + >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") + ``` + +2. [`TrainingArguments`] contient les hyperparamètres du modèle que vous pouvez changer comme le taux d'apprentissage, la taille de l'échantillon, et le nombre d'époques pour s'entraîner. Les valeurs par défaut sont utilisées si vous ne spécifiez pas d'hyperparamètres d'apprentissage : + + ```py + >>> from transformers import TrainingArguments + + >>> training_args = TrainingArguments( + ... output_dir="path/to/save/folder/", + ... learning_rate=2e-5, + ... per_device_train_batch_size=8, + ... per_device_eval_batch_size=8, + ... num_train_epochs=2, + ... ) + ``` + +3. 
Une classe de prétraitement comme un tokenizer, un processeur d'images ou un extracteur de caractéristiques : + + ```py + >>> from transformers import AutoTokenizer + + >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") + ``` + +4. Chargez un jeu de données : + + ```py + >>> from datasets import load_dataset + + >>> dataset = load_dataset("rotten_tomatoes") # doctest: +IGNORE_RESULT + ``` + +5. Créez une fonction qui transforme le texte du jeu de données en token : + + ```py + >>> def tokenize_dataset(dataset): + ... return tokenizer(dataset["text"]) + ``` + + Puis appliquez-la à l'intégralité du jeu de données avec [`~datasets.Dataset.map`]: + + ```py + >>> dataset = dataset.map(tokenize_dataset, batched=True) + ``` + +6. Un [`DataCollatorWithPadding`] pour créer un échantillon d'exemples à partir de votre jeu de données : + + ```py + >>> from transformers import DataCollatorWithPadding + + >>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + ``` + +Maintenant, rassemblez tous ces éléments dans un [`Trainer`] : + +```py +>>> from transformers import Trainer + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=dataset["train"], +... eval_dataset=dataset["test"], +... tokenizer=tokenizer, +... data_collator=data_collator, +... ) # doctest: +SKIP +``` + +Une fois que vous êtes prêt, appelez la fonction [`~Trainer.train`] pour commencer l'entraînement : + +```py +>>> trainer.train() # doctest: +SKIP +``` + + + +Pour les tâches - comme la traduction ou la génération de résumé - qui utilisent un modèle séquence à séquence, utilisez plutôt les classes [`Seq2SeqTrainer`] et [`Seq2SeqTrainingArguments`]. + + + +Vous pouvez personnaliser le comportement de la boucle d'apprentissage en redéfinissant les méthodes à l'intérieur de [`Trainer`]. Cela vous permet de personnaliser des caractéristiques telles que la fonction de perte, l'optimiseur et le planificateur. Consultez la documentation de [`Trainer`] pour savoir quelles méthodes peuvent être redéfinies. + +L'autre moyen de personnaliser la boucle d'apprentissage est d'utiliser les [Callbacks](./main_classes/callbacks). Vous pouvez utiliser les callbacks pour intégrer d'autres bibliothèques et inspecter la boucle d'apprentissage afin de suivre la progression ou d'arrêter l'apprentissage plus tôt. Les callbacks ne modifient rien dans la boucle d'apprentissage elle-même. Pour personnaliser quelque chose comme la fonction de perte, vous devez redéfinir le [`Trainer`] à la place. + +## Entraînement avec TensorFlow + +Tous les modèles sont des modèles standard [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) afin qu'ils puissent être entraînés avec TensorFlow avec l'API [Keras](https://keras.io/). 🤗 Transformers fournit la fonction [`~TFPreTrainedModel.prepare_tf_dataset`] pour charger facilement votre jeu de données comme un `tf.data.Dataset` afin que vous puissiez commencer l'entraînement immédiatement avec les fonctions [`compile`](https://keras.io/api/models/model_training_apis/#compile-method) et [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) de Keras. + +1. Vous commencez avec un modèle [`TFPreTrainedModel`] ou [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) : + + ```py + >>> from transformers import TFAutoModelForSequenceClassification + + >>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") + ``` + +2. 
Une classe de prétraitement comme un tokenizer, un processeur d'images ou un extracteur de caractéristiques : + + ```py + >>> from transformers import AutoTokenizer + + >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") + ``` + +3. Créez une fonction qui transforme le texte du jeu de données en token : + + ```py + >>> def tokenize_dataset(dataset): + ... return tokenizer(dataset["text"]) # doctest: +SKIP + ``` + +4. Appliquez le tokenizer à l'ensemble du jeu de données avec [`~datasets.Dataset.map`] et passez ensuite le jeu de données et le tokenizer à [`~TFPreTrainedModel.prepare_tf_dataset`]. Vous pouvez également modifier la taille de l'échantillon et mélanger le jeu de données ici si vous le souhaitez : + + ```py + >>> dataset = dataset.map(tokenize_dataset) # doctest: +SKIP + >>> tf_dataset = model.prepare_tf_dataset( + ... dataset, batch_size=16, shuffle=True, tokenizer=tokenizer + ... ) # doctest: +SKIP + ``` + +5. Une fois que vous êtes prêt, appelez les fonctions `compile` et `fit` pour commencer l'entraînement : + + ```py + >>> from tensorflow.keras.optimizers import Adam + + >>> model.compile(optimizer=Adam(3e-5)) + >>> model.fit(dataset) # doctest: +SKIP + ``` + +## Et après ? + +Maintenant que vous avez terminé la visite rapide de 🤗 Transformers, consultez nos guides et apprenez à faire des choses plus spécifiques comme créer un modèle personnalisé, finetuner un modèle pour une tâche, et comment entraîner un modèle avec un script. Si vous souhaitez en savoir plus sur les concepts fondamentaux de 🤗 Transformers, jetez un œil à nos guides conceptuels ! diff --git a/docs/source/it/_toctree.yml b/docs/source/it/_toctree.yml index 9c18dcdf9b70..47d90f9a9a85 100644 --- a/docs/source/it/_toctree.yml +++ b/docs/source/it/_toctree.yml @@ -1,47 +1,71 @@ -- sections: - - local: index - title: 🤗 Transformers - - local: quicktour - title: Tour rapido - - local: installation - title: Installazione - title: Iniziare -- sections: - - local: pipeline_tutorial - title: Pipeline per l'inferenza - - local: autoclass_tutorial - title: Carica istanze pre-allenate con AutoClass - - local: preprocessing - title: Preprocess - - local: training - title: Fine-tuning di un modello pre-addestrato - - local: accelerate - title: Allenamento distribuito con 🤗 Accelerate - - local: model_sharing - title: Condividere un modello - title: Esercitazione -- sections: - - local: create_a_model - title: Crea un'architettura personalizzata - - local: custom_models - title: Condividere modelli personalizzati - - local: run_scripts - title: Addestramento con script - - local: multilingual - title: Modelli multilingua per l'inferenza - - local: converting_tensorflow_models - title: Convertire modelli tensorflow - - local: serialization - title: Esporta modelli Transformers - - local: debugging - title: Debugging - title: Guide pratiche -- sections: - - local: add_new_pipeline - title: Come aggiungere una pipeline a 🤗 Transformers? - - local: add_new_model - title: Come aggiungere un modello a 🤗 Transformers? 
- - local: perf_hardware - title: Hardware ottimizzato per l'addestramento - title: Guide How-to - +- sections: + - local: index + title: 🤗 Transformers + - local: quicktour + title: Tour rapido + - local: installation + title: Installazione + title: Iniziare +- sections: + - local: pipeline_tutorial + title: Pipeline per l'inferenza + - local: autoclass_tutorial + title: Carica istanze pre-allenate con AutoClass + - local: preprocessing + title: Preprocess + - local: training + title: Fine-tuning di un modello pre-addestrato + - local: accelerate + title: Allenamento distribuito con 🤗 Accelerate + - local: model_sharing + title: Condividere un modello + title: Esercitazione +- sections: + - local: create_a_model + title: Crea un'architettura personalizzata + - local: custom_models + title: Condividere modelli personalizzati + - local: run_scripts + title: Addestramento con script + - local: multilingual + title: Modelli multilingua per l'inferenza + - local: converting_tensorflow_models + title: Convertire modelli tensorflow + - local: serialization + title: Esporta modelli Transformers + - local: perf_train_cpu + title: Addestramento efficiente su CPU + - local: perf_train_cpu_many + title: Addestramento efficiente su multiple CPU + - local: perf_train_tpu + title: Addestramento su TPU + - local: perf_train_special + title: Addestramento su Hardware Specializzato + - local: perf_infer_cpu + title: Inferenza Efficiente su CPU + - local: perf_infer_gpu_one + title: Inferenza su una GPU + - local: perf_infer_gpu_many + title: Inferenza Efficiente su GPU Multiple + - local: perf_infer_special + title: Inferenza su Hardware Specializzato + - local: big_models + title: Istanziare un big model + - local: migration + title: Passaggio da pacchetti precedenti + - local: debugging + title: Debugging + title: Guide pratiche +- sections: + - local: add_new_pipeline + title: Come aggiungere una pipeline a 🤗 Transformers? + - local: add_new_model + title: Come aggiungere un modello a 🤗 Transformers? + - local: perf_hardware + title: Hardware ottimizzato per l'addestramento + - local: community + title: Risorse della comunità + - local: pr_checks + title: Controlli su una Pull Request + title: Guide How-to + diff --git a/docs/source/it/accelerate.md b/docs/source/it/accelerate.md new file mode 100644 index 000000000000..3114613a9a79 --- /dev/null +++ b/docs/source/it/accelerate.md @@ -0,0 +1,136 @@ + + +# Allenamento distribuito con 🤗 Accelerate + +La parallelizzazione è emersa come strategia per allenare modelli sempre più grandi su hardware limitato e accelerarne la velocità di allenamento di diversi ordini di magnitudine. In Hugging Face, abbiamo creato la libreria [🤗 Accelerate](https://huggingface.co/docs/accelerate) per aiutarti ad allenare in modo semplice un modello 🤗 Transformers su qualsiasi tipo di configurazione distribuita, sia che si tratti di più GPU su una sola macchina o di più GPU su più macchine. In questo tutorial, imparerai come personalizzare il training loop nativo di PyTorch per consentire l'addestramento in un ambiente distribuito. + +## Configurazione + +Inizia installando 🤗 Accelerate: + +```bash +pip install accelerate +``` + +Poi importa e crea un oggetto [`Accelerator`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator). `Accelerator` rileverà automaticamente il tuo setup distribuito e inizializzerà tutte le componenti necessarie per l'allenamento. Non dovrai allocare esplicitamente il tuo modello su un device. 
+ +```py +>>> from accelerate import Accelerator + +>>> accelerator = Accelerator() +``` + +## Preparati ad accelerare + +Il prossimo passo è quello di passare tutti gli oggetti rilevanti per l'allenamento al metodo [`prepare`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.prepare). Questo include i tuoi DataLoaders per l'allenamento e per la valutazione, un modello e un ottimizzatore: + +```py +>>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( +... train_dataloader, eval_dataloader, model, optimizer +... ) +``` + +## Backward + +Infine, sostituisci il tipico metodo `loss.backward()` nel tuo loop di allenamento con il metodo [`backward`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.backward) di 🤗 Accelerate: + +```py +>>> for epoch in range(num_epochs): +... for batch in train_dataloader: +... outputs = model(**batch) +... loss = outputs.loss +... accelerator.backward(loss) + +... optimizer.step() +... lr_scheduler.step() +... optimizer.zero_grad() +... progress_bar.update(1) +``` + +Come puoi vedere nel seguente codice, hai solo bisogno di aggiungere quattro righe in più di codice al tuo training loop per abilitare l'allenamento distribuito! + +```diff ++ from accelerate import Accelerator + from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler + ++ accelerator = Accelerator() + + model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) + optimizer = AdamW(model.parameters(), lr=3e-5) + +- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +- model.to(device) + ++ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( ++ train_dataloader, eval_dataloader, model, optimizer ++ ) + + num_epochs = 3 + num_training_steps = num_epochs * len(train_dataloader) + lr_scheduler = get_scheduler( + "linear", + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=num_training_steps + ) + + progress_bar = tqdm(range(num_training_steps)) + + model.train() + for epoch in range(num_epochs): + for batch in train_dataloader: +- batch = {k: v.to(device) for k, v in batch.items()} + outputs = model(**batch) + loss = outputs.loss +- loss.backward() ++ accelerator.backward(loss) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) +``` + +## Allenamento + +Una volta che hai aggiunto le righe di codice rilevanti, lancia il tuo allenamento in uno script o in un notebook come Colaboratory. + +### Allenamento con uno script + +Se stai eseguendo il tuo allenamento da uno script, esegui il comando seguente per creare e salvare un file di configurazione: + +```bash +accelerate config +``` + +Poi lancia il tuo allenamento con: + +```bash +accelerate launch train.py +``` + +### Allenamento con un notebook + +La libreria 🤗 Accelerate può anche essere utilizzata in un notebook se stai pianificando di utilizzare le TPU di Colaboratory. Inserisci tutto il codice legato all'allenamento in una funzione, e passala al `notebook_launcher`: + +```py +>>> from accelerate import notebook_launcher + +>>> notebook_launcher(training_function) +``` + +Per maggiori informazioni relative a 🤗 Accelerate e le sue numerose funzionalità, fai riferimento alla [documentazione](https://huggingface.co/docs/accelerate). 
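+
+As a small sketch of the same call when your training function takes arguments (the argument names here are hypothetical; `notebook_launcher` forwards the `args` tuple to your function, and `num_processes` controls how many processes are launched, e.g. 8 for the 8 cores of a Colab TPU):
+
+```py
+>>> from accelerate import notebook_launcher
+
+>>> args = ("bert-base-cased", 3e-5)  # e.g. a checkpoint name and a learning rate your function expects
+>>> notebook_launcher(training_function, args, num_processes=8)
+```
+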
\ No newline at end of file diff --git a/docs/source/it/accelerate.mdx b/docs/source/it/accelerate.mdx deleted file mode 100644 index 20dc1a7ff90b..000000000000 --- a/docs/source/it/accelerate.mdx +++ /dev/null @@ -1,132 +0,0 @@ - - -# Allenamento distribuito con 🤗 Accelerate - -La parallelizzazione è emersa come strategia per allenare modelli sempre più grandi su hardware limitato e accelerarne la velocità di allenamento di diversi ordini di magnitudine. In Hugging Face, abbiamo creato la libreria [🤗 Accelerate](https://huggingface.co/docs/accelerate) per aiutarti ad allenare in modo semplice un modello 🤗 Transformers su qualsiasi tipo di configurazione distribuita, sia che si tratti di più GPU su una sola macchina o di più GPU su più macchine. In questo tutorial, imparerai come personalizzare il training loop nativo di PyTorch per consentire l'addestramento in un ambiente distribuito. - -## Configurazione - -Inizia installando 🤗 Accelerate: - -```bash -pip install accelerate -``` - -Poi importa e crea un oggetto [`Accelerator`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator). `Accelerator` rileverà automaticamente il tuo setup distribuito e inizializzerà tutte le componenti necessarie per l'allenamento. Non dovrai allocare esplicitamente il tuo modello su un device. - -```py ->>> from accelerate import Accelerator - ->>> accelerator = Accelerator() -``` - -## Preparati ad accelerare - -Il prossimo passo è quello di passare tutti gli oggetti rilevanti per l'allenamento al metodo [`prepare`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.prepare). Questo include i tuoi DataLoaders per l'allenamento e per la valutazione, un modello e un ottimizzatore: - -```py ->>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( -... train_dataloader, eval_dataloader, model, optimizer -... ) -``` - -## Backward - -Infine, sostituisci il tipico metodo `loss.backward()` nel tuo loop di allenamento con il metodo [`backward`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.backward) di 🤗 Accelerate: - -```py ->>> for epoch in range(num_epochs): -... for batch in train_dataloader: -... outputs = model(**batch) -... loss = outputs.loss -... accelerator.backward(loss) - -... optimizer.step() -... lr_scheduler.step() -... optimizer.zero_grad() -... progress_bar.update(1) -``` - -Come puoi vedere nel seguente codice, hai solo bisogno di aggiungere quattro righe in più di codice al tuo training loop per abilitare l'allenamento distribuito! 
- -```diff -+ from accelerate import Accelerator - from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler - -+ accelerator = Accelerator() - - model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) - optimizer = AdamW(model.parameters(), lr=3e-5) - -- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") -- model.to(device) - -+ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( -+ train_dataloader, eval_dataloader, model, optimizer -+ ) - - num_epochs = 3 - num_training_steps = num_epochs * len(train_dataloader) - lr_scheduler = get_scheduler( - "linear", - optimizer=optimizer, - num_warmup_steps=0, - num_training_steps=num_training_steps - ) - - progress_bar = tqdm(range(num_training_steps)) - - model.train() - for epoch in range(num_epochs): - for batch in train_dataloader: -- batch = {k: v.to(device) for k, v in batch.items()} - outputs = model(**batch) - loss = outputs.loss -- loss.backward() -+ accelerator.backward(loss) - - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad() - progress_bar.update(1) -``` - -## Allenamento - -Una volta che hai aggiunto le righe di codice rilevanti, lancia il tuo allenamento in uno script o in un notebook come Colaboratory. - -### Allenamento con uno script - -Se stai eseguendo il tuo allenamento da uno script, esegui il comando seguente per creare e salvare un file di configurazione: - -```bash -accelerate config -``` - -Poi lancia il tuo allenamento con: - -```bash -accelerate launch train.py -``` - -### Allenamento con un notebook - -La libreria 🤗 Accelerate può anche essere utilizzata in un notebook se stai pianificando di utilizzare le TPU di Colaboratory. Inserisci tutto il codice legato all'allenamento in una funzione, e passala al `notebook_launcher`: - -```py ->>> from accelerate import notebook_launcher - ->>> notebook_launcher(training_function) -``` - -Per maggiori informazioni relative a 🤗 Accelerate e le sue numerose funzionalità, fai riferimento alla [documentazione](https://huggingface.co/docs/accelerate). \ No newline at end of file diff --git a/docs/source/it/add_new_model.md b/docs/source/it/add_new_model.md new file mode 100644 index 000000000000..3ee22e804aaa --- /dev/null +++ b/docs/source/it/add_new_model.md @@ -0,0 +1,779 @@ + + +# Come aggiungere un modello a 🤗 Transformers? + +Aggiungere un nuovo modello é spesso difficile e richiede una profonda conoscenza della libreria 🤗 Transformers e anche +della repository originale del modello. A Hugging Face cerchiamo di dare alla community sempre piú poteri per aggiungere +modelli independentemente. Quindi, per alcuni nuovi modelli che la community vuole aggiungere a 🤗 Transformers, abbiamo +creato una specifica *call-for-model-addition* che spiega passo dopo passo come aggiungere il modello richiesto. Con +questo *call-for-model-addition* vogliamo insegnare a volenterosi e esperti collaboratori della community come implementare +un modello in 🤗 Transformers. + +Se questo é qualcosa che può interessarvi, siete liberi di controllare l'attuale “calls-for-model-addition” [qui](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model/open_model_proposals/README.md) +e contattarci. + +Se il modello sarà selezionato, allora potrete lavorare insieme a un membro di Hugging Face per integrare il modello in 🤗 +Transformers. Così facendo, ci guadagnerai in una comprensione totale, sia teorica che pratica, del modello proposto. 
Inoltre, +sarai l'artefice di un importante contributo open-source a 🤗 Transformers. Durante l'implementazione avrai l'opportunità di: + +- ottenere più comprensione delle best practices in open-source +- capire i principi di design di una della librerie NLP più popolari +- capire come efficientemente testare complessi modelli NLP +- capire come integrare utilit Python come `black`, `ruff`, `make fix-copies` in una libreria per garantire sempre di avere un codice leggibile e pulito + +Siamo anche contenti se vuoi aggiungere un modello che non può essere trovato nella cartella “calls-for-model-addition”. +Le seguenti sezioni spiegano in dettaglio come aggiungere un nuovo modello. Può anche essere molto utile controllare modelli +già aggiunti [qui](https://github.com/huggingface/transformers/pulls?q=is%3Apr+label%3A%22PR+for+Model+Addition%22+is%3Aclosed), +per capire se richiamano il modello che vorreste aggiungere. + +Per cominciare, vediamo una panoramica general della libreria Transformers. + +## Panoramica generale su 🤗 Transformers + +Prima di tutto, vediamo in generale 🤗 Transformers. 🤗 Transformers é una libreria molto strutturata, quindi +puà essere che a volte ci sia un disaccordo con alcune filosofie della libreria o scelte di design. Dalla nostra esperienza, +tuttavia, abbiamo trovato che le scelte fondamentali di design della libreria sono cruciali per usare 🤗 Transformers efficacemente +su larga scala, mantenendo i costi a un livello accettabile. + +Un buon primo punto di partenza per capire al meglio la libreria é leggere la [documentazione sulla nostra filosofia](filosofia) +Da qui, ci sono alcune scelte sul modo di lavorare che cerchiamo di applicare a tutti i modelli: + +- La composizione é generalmente favorita sulla sovra-astrazione +- Duplicare il codice non é sempre male, soprattutto se migliora notevolmente la leggibilità e accessibilità del modello +- Tutti i files creati per il nuovo modello devono il piu possibile "compatti". Questo vuol dire che quando qualcuno leggerá il codice +di uno specifico modello, potrá vedere solo il corrispettivo file `modeling_....py` senza avere multiple dipendenze. + + +La cosa piú importante, é che consideriamo la libreria non solo un mezzo per dare un prodotto, *per esempio* dare la possibilità +di usare BERT per inferenza, ma é anche il prodotto reale che noi vogliamo migliorare sempre più. Quindi, quando aggiungi +un modello, non sei solo la persona che userà il modello, ma rappresenti anche tutti coloro che leggeranno, +cercheranno di capire e modificare il tuo modello. + +Tenendo questi principi in mente, immergiamoci nel design generale della libreria. + +### Panoramica sui modelli + +Per aggiungere con successo un modello, é importante capire l'interazione tra il tuo modello e la sua configurazione, +[`PreTrainedModel`], e [`PretrainedConfig`]. Per dare un esempio, chiameremo il modello da aggiungere a 🤗 Transformers +`BrandNewBert`. + +Diamo un'occhiata: + + + +Come potete vedere, ci basiamo sull'ereditarietà in 🤗 Transformers, tenendo però il livello di astrazione a un minimo +assoluto. Non ci sono mai più di due livelli di astrazione per ogni modello nella libreria. `BrandNewBertModel` eredita +da `BrandNewBertPreTrainedModel` che, a sua volta, eredita da [`PreTrainedModel`] - semplice no? +Come regola generale, vogliamo essere sicuri che un nuovo modello dipenda solo da [`PreTrainedModel`]. 
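+
+As a schematic sketch of the hierarchy described above (the class bodies are placeholders rather than a working implementation, and `BrandNewBert` is of course a stand-in name):
+
+```python
+from transformers import PretrainedConfig, PreTrainedModel
+
+
+class BrandNewBertConfig(PretrainedConfig):
+    model_type = "brand_new_bert"
+
+
+class BrandNewBertPreTrainedModel(PreTrainedModel):
+    # the only intermediate level of abstraction: inherits directly from PreTrainedModel
+    config_class = BrandNewBertConfig
+    base_model_prefix = "brand_new_bert"
+
+
+class BrandNewBertModel(BrandNewBertPreTrainedModel):
+    # the actual architecture lives here, in modeling_brand_new_bert.py
+    def __init__(self, config):
+        super().__init__(config)
+        # ... layers are defined here ...
+
+
+class BrandNewBertForMaskedLM(BrandNewBertPreTrainedModel):
+    # a model with a head does not inherit from BrandNewBertModel,
+    # it uses BrandNewBertModel as a component in its forward pass
+    def __init__(self, config):
+        super().__init__(config)
+        self.brand_new_bert = BrandNewBertModel(config)
+        # ... the masked-LM head is defined here ...
+```
+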
Le funzionalità +importanti che sono automaticamente conferite a ogni nuovo modello sono [`~PreTrainedModel.from_pretrained`] +e [`~PreTrainedModel.save_pretrained`], che sono usate per serializzazione e deserializzazione. Tutte le altre importanti +funzionalità, come ad esempio `BrandNewBertModel.forward` devono essere definite completamente nel nuovo script +`modeling_brand_new_bert.py`. Inoltre, vogliamo essere sicuri che un modello con uno specifico head layer, come +`BrandNewBertForMaskedLM` non erediti da `BrandNewBertModel`, ma piuttosto usi `BrandNewBertModel` +come componente che può essere chiamata nel passaggio forward per mantenere il livello di astrazione basso. Ogni +nuovo modello richieste una classe di configurazione, chiamata `BrandNewBertConfig`. Questa configurazione é sempre +mantenuta come un attributo in [`PreTrainedModel`], e quindi può essere accessibile tramite l'attributo `config` +per tutte le classi che ereditano da `BrandNewBertPreTrainedModel`: + +```python +model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert") +model.config # il modello ha accesso al suo config +``` + +Analogamente al modello, la configurazione eredita le funzionalità base di serializzazione e deserializzazione da +[`PretrainedConfig`]. É da notare che la configurazione e il modello sono sempre serializzati in due formati differenti - +il modello é serializzato in un file *pytorch_model.bin* mentre la configurazione con *config.json*. Chiamando +[`~PreTrainedModel.save_pretrained`] automaticamente chiamerà [`~PretrainedConfig.save_pretrained`], cosicché sia il +modello che la configurazione siano salvati. + + +### Stile per il codice + +Quando codifichi un nuovo modello, tieni presente che Transformers ha una sua struttura di fondo come libreria, perciò +ci sono alcuni fatti da considerare su come scrivere un codice :-) + +1. Il forward pass del tuo modello dev'essere scritto completamente nel file del modello, mentre dev'essere indipendente + da altri modelli nella libreria. Se vuoi riutilizzare un blocco di codice da un altro modello, copia e incolla il codice con un commento `# Copied from` in cima al codice (guarda [qui](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160) + per un ottimo esempio). +2. Il codice dev'essere interamente comprensibile, anche da persone che non parlano in inglese. Questo significa che le + variabili devono avere un nome descrittivo e bisogna evitare abbreviazioni. Per esempio, `activation` é molto meglio + che `act`. Le variabili con una lettera sono da evitare fortemente, almeno che non sia per un indce in un for loop. +3. Generamente é meglio avere un codice esplicito e piú lungo che un codice corto e magico. +4. Evita di subclassare `nn.Sequential` in Pytorch, puoi subclassare `nn.Module` e scrivere il forward pass, cosicché + chiunque può effettuare debug sul tuo codice, aggiungendo print o breaking points. +5. La tua function-signature dev'essere type-annoted. Per il resto, é meglio preferire variabili con un nome accettabile + piuttosto che annotazioni per aumentare la comprensione e leggibilità del codice. + +### Panoramica sui tokenizers + +Questa sezione sarà creata al piu presto :-( + +## Aggiungere un modello a 🤗 Transformers passo dopo passo + +Ci sono differenti modi per aggiungere un modello a Hugging Face. Qui trovi una lista di blog posts da parte della community su come aggiungere un modello: + +1. 
[Aggiungere GPT2](https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28) scritto da [Thomas](https://huggingface.co/thomwolf) +2. [Aggiungere WMT19 MT](https://huggingface.co/blog/porting-fsmt) scritto da [Stas](https://huggingface.co/stas) + +Per esperienza, possiamo dirti che quando si aggiunge un modello é meglio tenere a mente le seguenti considerazioni: + +- Non sfondare una porta giá aperta! La maggior parte del codice che aggiungerai per un nuovo modello 🤗 Transformers + esiste già da qualche parte in 🤗 Transformers. Prendi un po' di tempo per trovare codici simili in modelli e tokenizers esistenti e fare un copia-incolla. Ricorda che [grep](https://www.gnu.org/software/grep/) e [rg](https://github.com/BurntSushi/ripgrep) sono tuoi buoni amici. Inoltre, ricorda che puó essere molto probabile che il tokenizer per il tuo modello sia basato sull'implementazione di un altro modello, e il codice del tuo modello stesso su un altro ancora. *Per esempio* il modello FSMT é basato su BART, mentre il tokenizer di FSMT é basato su XLM. +- Ricorda che qui é piu una sfida ingegneristica che scientifica. Spendi piú tempo per create un efficiente ambiente di debugging piuttosto che cercare di capire tutti gli aspetti teorici dell'articolo del modello. +- Chiedi aiuto se sei in panne! I modelli sono la parte principale di 🤗 Transformers, perciò qui a Hugging Face siamo più che contenti di aiutarti in ogni passo per aggiungere il tuo modello. Non esitare a chiedere se vedi che non riesci a progredire. + +Di seguito, diamo una ricetta generale per aiutare a portare un modello in 🤗 Transformers. + +La lista seguente é un sommario di tutto quello che é stato fatto per aggiungere un modello, e può essere usata come To-Do List: + +- 1. ☐ (Opzionale) Capire gli aspetti teorici del modello +- 2. ☐ Preparare l'ambiente dev per transformers +- 3. ☐ Preparare l'ambiente debugging della repository originale +- 4. ☐ Create uno script che gestisca con successo il forward pass usando la repository originale e checkpoint +- 5. ☐ Aggiungere con successo lo scheletro del modello a Transformers +- 6. ☐ Convertire i checkpoint original a Transformers checkpoint +- 7. ☐ Effettuare con successo la forward pass in Transformers, di modo che dia un output identico al checkpoint originale +- 8. ☐ Finire i tests per il modello in Transformers +- 9. ☐ Aggiungere con successo Tokenizer in Transformers +- 10. ☐ Testare e provare gli integration tests da capo a fine +- 11. ☐ Completare i docs +- 12. ☐ Caricare i moedl weights all'hub +- 13. ☐ Sottomettere una pull request +- 14. ☐ (Opzionale) Aggiungere un notebook con una demo + +Per cominciare di solito consigliamo `BrandNewBert`, partendo dalla teoria, di modo da avere una buona comprensione della teoria generale. TUttavia, se preferisci imparare l'aspetto teorico del modello mentre *lavori* sul modello é ok immergersi direttamente nel codice di `BrandNewBert`. Questa opzione puó essere buona se le tue skills ingegneristiche sono meglio che quelle teoriche, o se il paper `BrandNewBert` ti dá problemi, o se semplicemente ti piace programmare piú che leggere articoli scientifici. + +### 1. (Opzionale) Aspetti teorici di BrandNewBert + +Allora con calma, prendi un po' di tempo per leggere l'articolo su *BrandNewBert* . Sicuramente, alcune sezioni dell'articolo sono molto complesse, ma non preoccuparti! 
L'obiettivo non é avere una compresione immensa della teoria alla base, ma estrarre le informazioni necessarie per re-implementare con successo il modello in 🤗 Transformers. Quindi, non impazzire sugli aspetti teorici, ma piuttosto focalizzati su quelli pratici, ossia: + +- Che tipo di modello é *brand_new_bert*? É solo un encoder in stile BERT? O tipo decoder come GPT2? O encoder e decoder stile BART? Dai un'occhiata a [model_summary](model_summary) se non sei famigliare con le differenze tra questi modelli +- Quali sono le applicazioni di *brand_new_bert*? Classificazione di testo? Generazione di testo? O per tasks del genere seq2seq? +- Quali sono le nuove aggiunte al modello che lo rendono diverso da BERT/GPT-2/BART? +- Quali modelli estistenti in [🤗 Transformers models](https://huggingface.co/transformers/#contents) sono molto simili a *brand_new_bert*? +- Che tipo di tokenizer si usa in questo caso? Un sentencepiece tokenizer? O un word piece tokenizer? Il tokenizer é lo stesso di BERT o BART? + +Una volta che senti che hai avuto una bella overview dell'architettura del modello, puoi scrivere senza problemi al team di Hugging Face per ogni domanda che tu hai. Questo puó includere domande sull'architettura del modello, o sull'attention layer, etc. Saremo molto felici di aiutarti :) + + +### 2. Prepare il tuo ambiente + +1. Forka la [repository](https://github.com/huggingface/transformers) cliccando sul tasto ‘Fork' nella pagina della repository. Questo crea una copia del codice nel tuo account GitHub + +2. Clona il tuo fork `transfomers` sul tuo dico locale, e aggiungi la repository base come remota: + +```bash +git clone https://github.com/[your Github handle]/transformers.git +cd transformers +git remote add upstream https://github.com/huggingface/transformers.git +``` + + +3. Crea un ambiente di sviluppo, per esempio tramite questo comando: + +```bash +python -m venv .env +source .env/bin/activate +pip install -e ".[dev]" +``` + +quindi torna alla directory principale: + +```bash +cd .. +``` + + +4. Attenzione, raccomandiamo di aggiungere la versione di PyTorch di *brand_new_bert* a Transfomers. Per installare PyTorch, basta seguire queste istruzioni https://pytorch.org/get-started/locally/. + +**Nota bene:** Non c'é bisogno di installare o avere installato CUDA. Il nuovo modello può funzionare senza problemi su una CPU. + + +5. Per trasferire *brand_new_bert* To port *brand_new_bert* avrai bisogno anche accesso alla sua repository originale: + +```bash +git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git +cd brand_new_bert +pip install -e . +``` + +Ok, ora hai un ambiente di sviluppo per portare *brand_new_bert* in 🤗 Transformers. + + +### 3.-4. Provare un pretrained checkpoint usando la repo originale + +Per cominciare, comincerai a lavorare sulla repo originale di *brand_new_bert*. Come spesso accade, l'implementazione originale é molto sullo stile "ricerca". Questo significa che a volte la documentazione non é al top, magari manca qualche cosa e il codice puó essere difficile da capire. Tuttavia, questa é e dev'essere la motivazione per reimplementare *brand_new_bert*. In Hugging Face, uno degli obiettivi principali é di *mettere le persone sulle spalle dei giganti*, il che si traduce, in questo contesto, di prendere un modello funzionante e riscriverlo e renderlo il piú possibile **accessibile, user-friendly, e leggibile**. 
Questa é la top motivazione per re-implementare modelli in 🤗 Transformers - cercare di creare nuove complesse tecnologie NLP accessibili a **chiunque**. + +Riuscire a far girare il modello pretrained originale dalla repository ufficiale é spesso il passo **piu arduo**. Dalla nostra esperienza, é molto importante spendere un p' di tempo per diventare familiari con il codice base originale. Come test, prova a capire i seguenti punti: + +- Dove si trovano i pretrained weights? +- Come caricare i pretrained weights nel modello corrispondente? +- Come girare un tokenizer independentemente dal modello? +- Prova a tracciare un singolo forward pass, cosicché potrai sapere che classi e funzioni sono richieste per un semplice forward pass. Di solito, dovrai reimplementare queste funzioni e basta +- Prova a localizzare i componenti importanti del modello: Dove si trova la classe del modello? Ci sono sotto classi nel modello *per esempio* EngoderModel, DecoderMOdel? Dove si trova il self-attention layer? Ci sono molteplici differenti layer di attention, *per esempio * *self-attention*, *cross-attention*...? +- Come puoi fare debug sul modello nell'ambiente originale della repo? Devi aggiungere dei *print* o puoi usare *ipdb* come debugger interattivo, o vabene anche un IDE efficiente per debug come PyCharm? + +É molto importante che prima di cominciare a trasferire il modello nuovo tu spenda tempo a fare debug del codice originale in maniera **efficiente**! Inoltre, ricorda che tutta la library é open-soruce, quindi non temere di aprire issue o fare una pull request nella repo originale. Tutti coloro che mantengono la repository saranno piú che felici di avere qualcuno che guarda e gioca con i loro codici! + +A questo punto, sta a te decidere quale ambiente per debug vuoi usare. Noi consilgiamo di evitare setup con GPU, che potrebbero costare assai, lavorare su una CPU puó essere un ottimo punto di partenza per indagare la repository originale e per cominciare a scrivere il codice per 🤗 Transformers. Solo alla fine, quando il modello é stato portato con successo in 🤗 Transformers, allora si potrá verificare il suo funzionamento su GPU. + +In generale ci sono due possibili ambienti di debug per il testare il modello originale: + +- [Jupyter notebooks](https://jupyter.org/) / [google colab](https://colab.research.google.com/notebooks/intro.ipynb) +- Scripts locali in Python + +Il vantaggio dei Jupyter notebooks é la possibilità di eseguire cella per cella, il che può essere utile per decomporre tutte le componenti logiche, cosi da a vere un ciclo di debug più rapido, siccome si possono salvare i risultati da steps intermedi. Inoltre, i notebooks spesso sono molto facili da condividere con altri contributors, il che può essere molto utile se vuoi chiedere aiuto al team di Hugging Face. Se sei famigliare con Jupyter notebooks allora racommandiamo di lavorare in questa maniera. + +Ovviamente se non siete abituati a lavorare con i notebook, questo può essere uno svantaggio nell'usare questa tecnologia, sprecando un sacco di tempo per setup e portare tutto al nuovo ambiente, siccome non potreste neanche usare dei tools di debug come `ipdb`. + +Per ogni pratica code-base, é sempre meglio come primo step caricare un **piccolo** checkpoint pretrained e cercare di riprodurre un singolo forward pass usando un vettore fittizio di IDs fatti da numeri interi. 
Un esempio per uno script simile, in pseudocodice é: + +```python +model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") +input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids +original_output = model.predict(input_ids) +``` + +Per quanto riguarda la strategia di debugging, si può scegliere tra: + +- Decomporre il modello originario in piccole componenenti e testare ognuna di esse +- Decomporre il modello originario nel *tokenizer* originale e nel *modello* originale, testare un forward pass su questi, +e usare dei print statement o breakpoints intermedi per verificare + +Ancora una volta, siete liberi di scegliere quale strategia sia ottimale per voi. Spesso una strategia é piu +avvantaggiosa di un'altra, ma tutto dipende dall'code-base originario. + +Se il code-base vi permette di decomporre il modello in piccole sub-componenenti, *per esempio* se il code-base +originario può essere facilmente testato in eager mode, allora vale la pena effettuare un debugging di questo genere. +Ricordate che ci sono dei vantaggi nel decidere di prendere la strada piu impegnativa sin da subito: + +- negli stage piu finali, quando bisognerà comparare il modello originario all'implementazione in Hugging Face, potrete verificare +automaticamente ogni componente, individualmente, di modo che ci sia una corrispondenza 1:1 +- avrete l'opportunità di decomporre un problema molto grande in piccoli passi, così da strutturare meglio il vostro lavoro +- separare il modello in componenti logiche vi aiuterà ad avere un'ottima overview sul design del modello, quindi una migliore +comprensione del modello stesso +- verso gli stage finali i test fatti componente per componente vi aiuterà ad essere sicuri di non andare avanti e indietro +nell'implementazione, così da continuare la modifica del codice senza interruzione + +Un ottimo esempio di come questo può essere fatto é dato da [Lysandre](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed) +per il modello ELECTRA + +Tuttavia, se il code-base originale é molto complesso o le componenti intermedie possono essere testate solo in tramite +compilazione, potrebbe richiedere parecchio tempo o addirittura essere impossibile separare il modello in piccole sotto-componenti. +Un buon esempio é [MeshTensorFlow di T5](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow). Questa libreria +é molto complessa e non offre un metodo semplice di decomposizione in sotto-componenti. Per simili librerie, potrete fare +affidamento ai print statements. + +In ogni caso, indipendentemente da quale strategia scegliete, la procedura raccomandata é di cominciare a fare debug dal +primo layer al layer finale. +É consigliato recuperare gli output dai layers, tramite print o sotto-componenti, nel seguente ordine: + +1. Recuperare gli IDs di input dati al modello +2. Recuperare i word embeddings +3. Recuperare l'input del primo Transformer layer +4. Recuperare l'output del primo Transformer layer +5. Recuperare l'output dei seguenti `n - 1` Transformer layers +6. 
Recuperare l'output dell'intero BrandNewBert Model + +Gli IDs in input dovrebbero essere un arrary di interi, *per esempio* `input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]` + +Gli output dei seguenti layer di solito dovrebbero essere degli array di float multi-dimensionali come questo: + +``` +[[ + [-0.1465, -0.6501, 0.1993, ..., 0.1451, 0.3430, 0.6024], + [-0.4417, -0.5920, 0.3450, ..., -0.3062, 0.6182, 0.7132], + [-0.5009, -0.7122, 0.4548, ..., -0.3662, 0.6091, 0.7648], + ..., + [-0.5613, -0.6332, 0.4324, ..., -0.3792, 0.7372, 0.9288], + [-0.5416, -0.6345, 0.4180, ..., -0.3564, 0.6992, 0.9191], + [-0.5334, -0.6403, 0.4271, ..., -0.3339, 0.6533, 0.8694]]], +``` + +Ci aspettiamo che ogni modello aggiunto a 🤗 Transformers passi con successo un paio di test d'integrazione. Questo +significa che il modello originale e la sua implementazione in 🤗 Transformers abbiano lo stesso output con una precisione +di 0.001! Siccome é normale che lo stesso esatto modello, scritto in librerie diverse, possa dare output leggermente +diversi, la tolleranza accettata é 1e-3 (0.001). Ricordate che i due modelli devono dare output quasi identici. Dunque, +é molto conveniente comparare gli output intermedi di 🤗 Transformers molteplici volte con gli output intermedi del +modello originale di *brand_new_bert*. Di seguito vi diamo alcuni consigli per avere un ambiente di debug il piu efficiente +possibile: + +- Trovate la migliore strategia per fare debug dei risultati intermedi. Per esempio, é la repository originale scritta in PyTorch? +Se si, molto probabilmente dovrete dedicare un po' di tempo per scrivere degli script piu lunghi, così da decomporre il +modello originale in piccole sotto-componenti, in modo da poter recuperare i valori intermedi. Oppure, la repo originale +é scritta in Tensorflow 1? Se é così dovrete fare affidamento ai print di Tensorflow [tf.print](https://www.tensorflow.org/api_docs/python/tf/print) +per avere i valori intermedi. Altro caso, la repo é scritta in Jax? Allora assicuratevi che il modello non sia in **jit** +quanto testate il foward pass, *per esempio* controllate [questo link](https://github.com/google/jax/issues/196). +- Usate i più piccoli pretrained checkpoint che potete trovare. Piu piccolo é il checkpoint, piu velocemente sarà il vostro +ciclo di debug. Non é efficiente avere un pretrained model così gigante che per il forward pass impieghi piu di 10 secondi. +Nel caso in cui i checkpoints siano molto grandi, e non si possa trovare di meglio, allora é buona consuetudine ricorrere +a fare un dummy model nel nuovo ambiente, con weights inizializzati random e salvare quei weights per comprare la versione 🤗 Transformers +con il vostro modello +- Accertatevi di usare la via piu semplice per chiamare il forward pass nella repo originale. Sarebbe opportuno trovare +la funzione originaria che chiami **solo** un singolo forward pass, *per esempio* questa funzione spesso viene chiamata +`predict`, `evaluate`, `forward` o `__call__`. Siate sicuri di non fare debug su una funzione che chiami `forward` molteplici +volte, *per esempio* per generare testo, come `autoregressive_sample`, `generate`. +- Cercate di separare la tokenization dal forward pass del modello. Se la repo originaria mostra esempio dove potete dare +come input una stringa, provate a cercare dove nella forward call la stringa viene cambiata in input ids e cominciate il +debug da questo punto. 
Questo vi garantisce un ottimo punto di partenza per scrivere un piccolo script personale dove dare +gli input al modello, anziche delle stringhe in input. +- Assicuratevi che il debugging **non** sia in training mode. Spesso questo potra il modello a dare degli output random, per +via dei molteplici dropout layers. Assicuratevi che il forward pass nell'ambiente di debug sia **deterministico**, cosicche +i dropout non siano usati. Alternativamente, potete usare *transformers.utils.set_seed* se la vecchia e nuova implementazione +sono nello stesso framework. + +La seguente sezione vi da ulteriori dettagli e accorgimenti su come potete fare tutto questo per *brand_new_bert*. + + +### 5.-14. Trasferire BrandNewBert in 🤗 Transformers + +Allora cominciamo ad aggiungere un nuovo codice in 🤗 Transformers. Andate nel vostro fork clone di 🤗 Transformers: + + +```bash +cd transformers +``` + +Nel caso speciale in cui stiate aggiungendo un modello, la cui architettura sia identica a una di un modello già esistente, +dovrete solo aggiugnere uno script di conversione, come descritto [qui](#write-a-conversion-script). +In questo caso, potete riutilizzare l'intera architettura del modello gia esistente. + +Se questo non é il caso, cominciamo con il generare un nuovo modello. Avrete due opzioni: + +- `transformers-cli add-new-model-like` per aggiungere un nuovo modello come uno che gia esiste +- `transformers-cli add-new-model` per aggiungere un nuovo modello da un nostro template (questo assomigliera a BERT o Bart, in base al modello che selezionerete) + +In entrambi i casi, l'output vi darà un questionario da riempire con informazioni basi sul modello. Il secondo comando richiede di installare +un `cookiecutter` - maggiori informazioni [qui](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model). + +**Aprire una Pull Request in main huggingface/transformers repo** + +Prime di cominciare ad adattare il codice automaticamente generato, aprite una nuova PR come "Work in progress (WIP)", +*per esempio* "[WIP] Aggiungere *brand_new_bert*", cosicché il team di Hugging Face possa lavorare al vostro fianco nell' +integrare il modello in 🤗 Transformers. + +Questi sarebbero gli step generali da seguire: + +1. Creare un branch dal main branch con un nome descrittivo + +```bash +git checkout -b add_brand_new_bert +``` + +2. Commit del codice automaticamente generato + +```bash +git add . +git commit +``` + +3. Fare fetch e rebase del main esistente + +```bash +git fetch upstream +git rebase upstream/main +``` + +4. Push dei cambiamenti al proprio account: + +```bash +git push -u origin a-descriptive-name-for-my-changes +``` + +5. Una volte che siete soddisfatti dei nuovi cambiamenti, andate sulla webpage del vostro fork su GitHub. Cliccate "Pull request". +Assiuratevi di aggiungere alcuni membri di Hugging Face come reviewers, nel riguardo alla destra della pagina della PR, cosicche il team +Hugging Face verrà notificato anche per i futuri cambiamenti. + +6. Cambiare la PR a draft, cliccando su "Convert to draft" alla destra della pagina della PR + +Da quel punto in poi, ricordate di fare commit di ogni progresso e cambiamento, cosicche venga mostrato nella PR. Inoltre, +ricordatevi di tenere aggiornato il vostro lavoro con il main esistente: + +```bash +git fetch upstream +git merge upstream/main +``` + +In generale, tutte le domande che avrete riguardo al modello o l'implementazione dovranno essere fatte nella vostra PR +e discusse/risolte nella PR stessa. 
In questa maniera, il team di Hugging Face sarà sempre notificato quando farete commit +di un nuovo codice o se avrete qualche domanda. É molto utile indicare al team di Hugging Face il codice a cui fate riferimento +nella domanda, cosicche il team potra facilmente capire il problema o la domanda. + +Per fare questo andate sulla tab "Files changed", dove potrete vedere tutti i vostri cambiamenti al codice, andate sulla linea +dove volete chiedere una domanda, e cliccate sul simbolo "+" per aggiungere un commento. Ogni volta che una domanda o problema +é stato risolto, cliccate sul bottone "Resolve". + +In questa stessa maniera, Hugging Face aprirà domande o commenti nel rivedere il vostro codice. Mi raccomando, chiedete più +domande possibili nella pagina della vostra PR. Se avete domande molto generali, non molto utili per il pubblico, siete liberi +di chiedere al team Hugging Face direttamente su slack o email. + + +**5. Adattare i codici per brand_new_bert** + +Per prima cosa, ci focalizzeremo sul modello e non sui tokenizer. Tutto il codice relative dovrebbe trovarsi in +`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` e +`src/transformers/models/brand_new_bert/configuration_brand_new_bert.py`. + +Ora potete finalmente cominciare il codice :). Il codice generato in +`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` avrà sia la stessa architettura di BERT se é un +modello encoder-only o BART se é encoder-decoder. A questo punto, ricordatevi cio che avete imparato all'inizio, riguardo +agli aspetti teorici del modello: *In che maniera il modello che sto implmementando é diverso da BERT o BART?*. Implementare +questi cambi spesso vuol dire cambiare il layer *self-attention*, l'ordine dei layer di normalizzazione e così via... +Ancora una volta ripetiamo, é molto utile vedere architetture simili di modelli gia esistenti in Transformers per avere +un'idea migliore su come implementare il modello. + +**Notate** che a questo punto non dovete avere subito un codice tutto corretto o pulito. Piuttosto, é consigliato cominciare con un +codice poco pulito, con copia-incolla del codice originale in `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` +fino a che non avrete tutto il codice necessario. In base alla nostra esperienza, é molto meglio aggiungere una prima bozza +del codice richiesto e poi correggere e migliorare iterativamente. L'unica cosa essenziale che deve funzionare qui é la seguente +instanza: + +```python +from transformers import BrandNewBertModel, BrandNewBertConfig + +model = BrandNewBertModel(BrandNewBertConfig()) +``` + +Questo comando creerà un modello con i parametri di default definiti in `BrandNewBergConfig()` e weights random. Questo garantisce +che `init()` di tutte le componenti funzioni correttamente. + + +**6. Scrivere uno script di conversione** + +Il prossimo step é scrivere uno script per convertire il checkpoint che avete usato per fare debug su *brand_new_berts* nella +repo originale in un checkpoint per la nuova implementazione di *brand_new_bert* in 🤗 Transformers. Non é consigliato scrivere +lo script di conversione da zero, ma piuttosto cercate e guardate script gia esistenti in 🤗 Transformers, così da trovarne +uno simile al vostro modello. Di solito basta fare una copia di uno script gia esistente e adattarlo al vostro caso. +Non esistate a chiedre al team di Hugging Face a riguardo. 
+ +- Se state convertendo un modello da TensorFlow a PyTorch, un ottimo inizio é vedere [questo script di conversione per BERT](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91) +- Se state convertendo un modello da PyTorch a PyTorch, [lo script di conversione di BART può esservi utile](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py) + +Qui di seguito spiegheremo come i modelli PyTorch salvano i weights per ogni layer e come i nomi dei layer sono definiti. In PyTorch, +il nomde del layer é definito dal nome della class attribute che date al layer. Definiamo un modello dummy in PyTorch, +chiamato `SimpleModel`: + +```python +from torch import nn + + +class SimpleModel(nn.Module): + def __init__(self): + super().__init__() + self.dense = nn.Linear(10, 10) + self.intermediate = nn.Linear(10, 10) + self.layer_norm = nn.LayerNorm(10) +``` +Ora possiamo creare un'instanza di questa definizione di modo da inizializzare a random weights: `dense`, `intermediate`, `layer_norm`. +Possiamo usare print per vedere l'architettura del modello: + +```python +model = SimpleModel() + +print(model) +``` + +Da cui si ottiene: + +``` +SimpleModel( + (dense): Linear(in_features=10, out_features=10, bias=True) + (intermediate): Linear(in_features=10, out_features=10, bias=True) + (layer_norm): LayerNorm((10,), eps=1e-05, elementwise_affine=True) +) +``` + +Si può vedere come i nomi dei layers siano definiti dal nome della class attribute in PyTorch. I valori dei weights di uno +specifico layer possono essere visualizzati: + + +```python +print(model.dense.weight.data) +``` + +ad esempio: + +``` +tensor([[-0.0818, 0.2207, -0.0749, -0.0030, 0.0045, -0.1569, -0.1598, 0.0212, + -0.2077, 0.2157], + [ 0.1044, 0.0201, 0.0990, 0.2482, 0.3116, 0.2509, 0.2866, -0.2190, + 0.2166, -0.0212], + [-0.2000, 0.1107, -0.1999, -0.3119, 0.1559, 0.0993, 0.1776, -0.1950, + -0.1023, -0.0447], + [-0.0888, -0.1092, 0.2281, 0.0336, 0.1817, -0.0115, 0.2096, 0.1415, + -0.1876, -0.2467], + [ 0.2208, -0.2352, -0.1426, -0.2636, -0.2889, -0.2061, -0.2849, -0.0465, + 0.2577, 0.0402], + [ 0.1502, 0.2465, 0.2566, 0.0693, 0.2352, -0.0530, 0.1859, -0.0604, + 0.2132, 0.1680], + [ 0.1733, -0.2407, -0.1721, 0.1484, 0.0358, -0.0633, -0.0721, -0.0090, + 0.2707, -0.2509], + [-0.1173, 0.1561, 0.2945, 0.0595, -0.1996, 0.2988, -0.0802, 0.0407, + 0.1829, -0.1568], + [-0.1164, -0.2228, -0.0403, 0.0428, 0.1339, 0.0047, 0.1967, 0.2923, + 0.0333, -0.0536], + [-0.1492, -0.1616, 0.1057, 0.1950, -0.2807, -0.2710, -0.1586, 0.0739, + 0.2220, 0.2358]]). +``` + +Nello script di conversione, dovreste riempire quei valori di inizializzazione random con gli stessi weights del corrispondente +layer nel checkpoint. *Per esempio* + +```python +# retrieve matching layer weights, e.g. by +# recursive algorithm +layer_name = "dense" +pretrained_weight = array_of_dense_layer + +model_pointer = getattr(model, "dense") + +model_pointer.weight.data = torch.from_numpy(pretrained_weight) +``` + +Così facendo, dovete verificare che ogni inizializzazione random di un peso del modello PyTorch e il suo corrispondente peso nel pretrained checkpoint +siano esattamente gli stessi e uguali in **dimensione/shape e nome**. 
Per fare questo, è **necessario** aggiungere un `assert` sulla dimensione/shape e sul nome:

```python
assert (
    model_pointer.weight.shape == pretrained_weight.shape
), f"Pointer shape of random weight {model_pointer.weight.shape} and array shape of checkpoint weight {pretrained_weight.shape} mismatched"
```

Inoltre, dovreste stampare sia i nomi che i weights per essere sicuri che corrispondano:

```python
logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}")
```

Se la dimensione o il nome non coincidono, probabilmente avete assegnato il peso del checkpoint al layer sbagliato nel costruttore di 🤗 Transformers.

Una dimensione sbagliata può essere dovuta a un errore nei parametri di `BrandNewBertConfig()`. Tuttavia, può anche darsi che l'implementazione del layer in PyTorch richieda una trasposizione della matrice dei weights.

Infine, controllate che **tutti** i weights siano stati inizializzati e stampate tutti i weights del checkpoint che non sono stati usati per l'inizializzazione, in modo da essere sicuri che il modello sia stato convertito correttamente. È del tutto normale che i primi tentativi di conversione falliscano, sia per un errore in `BrandNewBertConfig()`, sia per un errore nell'architettura in 🤗 Transformers, sia per un bug in `init()`.

Questo step va ripetuto iterativamente fino a che tutti i weights del checkpoint non vengono caricati con gli stessi valori. Una volta che il checkpoint è stato caricato correttamente in 🤗 Transformers, potete salvare il modello in una cartella di vostra scelta `/path/to/converted/checkpoint/folder`, che conterrà sia `pytorch_model.bin` che `config.json`:

```python
model.save_pretrained("/path/to/converted/checkpoint/folder")
```


**7. Implementare il forward pass**

Una volta che i weights pretrained sono stati caricati correttamente in 🤗 Transformers, dovrete assicurarvi che il forward pass sia implementato correttamente. [Qui](#provare-un-pretrained-checkpoint-usando-la-repo-originale) avete già creato e provato uno script che testa il forward pass del modello usando la repo originaria. Ora dovrete fare lo stesso con uno script analogo che usi l'implementazione in 🤗 Transformers anziché l'originale. Più o meno lo script dovrebbe essere:

```python
model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder")
input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]
output = model(input_ids).last_hidden_state
```

Di solito l'output di 🤗 Transformers non è identico all'output originario, soprattutto la prima volta. Non vi abbattete: è normale! Prima di tutto assicuratevi che il forward pass non sollevi errori. Spesso capita che ci siano dimensioni sbagliate o data type sbagliati, *ad esempio* `torch.long` anziché `torch.float32`. Non esitate a chiedere al team Hugging Face!

Nella parte finale assicuratevi che l'implementazione 🤗 Transformers funzioni correttamente, verificando che gli output siano equivalenti con una precisione di `1e-3`. Controllate prima che le `outputs.shape` siano le stesse tra 🤗 Transformers e l'implementazione originaria. Poi controllate che i valori in output siano identici.
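Ad esempio, un piccolo sketch di verifica, ipotizzando che `original_output` e `output` siano i tensori finali prodotti rispettivamente dalla repo originale e da 🤗 Transformers, potrebbe essere:

```python
import torch

# Ipotesi: original_output e output sono due torch.Tensor con la stessa semantica.
assert original_output.shape == output.shape, f"Shape diverse: {original_output.shape} vs {output.shape}"

# La differenza massima assoluta aiuta a capire quanto siete lontani dalla tolleranza.
max_diff = (original_output - output).abs().max().item()
print(f"Differenza massima assoluta: {max_diff}")

assert torch.allclose(original_output, output, atol=1e-3), "Gli output differiscono oltre la tolleranza di 1e-3"
```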
Questa è sicuramente la parte più difficile; ecco una serie di errori comuni quando gli output non coincidono:

- Alcuni layer non sono stati aggiunti, *ad esempio* manca un layer di *activation*, oppure ci si è scordati di una connessione residua
- La matrice dei word embedding non è stata legata (*tied*) correttamente
- Gli embedding posizionali sono sbagliati perché l'implementazione originaria usa un offset
- Il dropout è attivo durante il forward pass. Per sistemare questo errore controllate che *model.training = False* e che il dropout non venga attivato nel forward pass, *per esempio* passando *self.training* a [PyTorch's functional dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout)

La maniera migliore per sistemare il problema è mettere fianco a fianco il forward pass dell'implementazione originaria e quello in 🤗 Transformers e controllare se ci sono differenze. Idealmente, fare debug e stampare gli output intermedi di entrambe le implementazioni nello stesso punto esatto del network vi aiuterà a individuare dove le due implementazioni divergono. Come prima mossa controllate che gli `input_ids` siano identici in entrambi gli script. Da lì procedete fino all'ultimo layer: a un certo punto noterete una differenza tra le due implementazioni.

Una volta ottenuto lo stesso output, verificatelo con `torch.allclose(original_output, output, atol=1e-3)`. Se a questo punto è tutto a posto: complimenti! Le parti seguenti saranno una passeggiata 😊.


**8. Aggiungere i test necessari per il modello**

A questo punto avete aggiunto con successo il vostro nuovo modello. Tuttavia, è molto probabile che il modello non rispetti ancora del tutto il design richiesto. Per essere sicuri che l'implementazione sia pienamente compatibile con 🤗 Transformers è necessario aggiungere dei test. Il Cookiecutter dovrebbe aver fornito automaticamente un file di test per il vostro modello, di solito in `tests/test_modeling_brand_new_bert.py`. Eseguitelo per verificare che i test più comuni passino:

```bash
pytest tests/test_modeling_brand_new_bert.py
```

Una volta sistemati i test comuni, bisogna assicurarsi che il vostro lavoro sia testato correttamente, cosicché:

- a) la community possa capire facilmente il vostro lavoro guardando i test specifici di *brand_new_bert*,
- b) modifiche future al vostro modello non rompano alcuna feature importante del modello.

Per prima cosa aggiungete dei test d'integrazione. Questi sono essenziali perché svolgono la stessa funzione degli script di debug usati in precedenza. Un template per questi test esiste già nel Cookiecutter, sotto il nome di `BrandNewBertModelIntegrationTests`: dovrete solo completarlo. Una volta che questi test passano, provate:

```bash
RUN_SLOW=1 pytest -sv tests/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests
```

Nel caso siate su Windows, sostituite `RUN_SLOW=1` con `SET RUN_SLOW=1`

Di seguito, tutte le feature utili e particolari di *brand_new_bert* devono essere testate in test separati, contenuti in `BrandNewBertModelTester`/`BrandNewBertModelTest`.
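Per esempio, uno sketch puramente indicativo di un test dedicato a una feature specifica, dove i nomi delle classi sono i segnaposto usati in questa guida e la feature testata è solo un'ipotesi, potrebbe essere:

```python
import unittest

import torch

from transformers import BrandNewBertConfig, BrandNewBertModel


class BrandNewBertFeatureTest(unittest.TestCase):
    def test_attention_outputs(self):
        # Configurazione volutamente piccola per mantenere il test veloce.
        config = BrandNewBertConfig(num_hidden_layers=2, hidden_size=32, num_attention_heads=4)
        model = BrandNewBertModel(config)
        model.eval()

        input_ids = torch.tensor([[0, 4, 4, 3, 2, 4, 1, 7, 19]])
        with torch.no_grad():
            outputs = model(input_ids, output_attentions=True)

        # Ci si aspetta un tensore di attention per ogni layer.
        self.assertEqual(len(outputs.attentions), config.num_hidden_layers)


if __name__ == "__main__":
    unittest.main()
```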
Spesso la gente si scorda di questi test, ma ricordate che sono utili perché:

- Aiutano gli utenti a capire meglio il vostro codice, richiamando l'attenzione sulle nuove feature
- Developer e contributor futuri potranno testare velocemente eventuali modifiche al modello eseguendo questi casi speciali.


**9. Implementare il tokenizer**

A questo punto avremo bisogno di un tokenizer per *brand_new_bert*. Di solito il tokenizer è equivalente o molto simile a quello di un altro modello già presente in 🤗 Transformers.

È importante che troviate il file del tokenizer originale e che riusciate a caricarlo in 🤗 Transformers.

Per controllare che il tokenizer funzioni in modo corretto, create prima uno script nella repo originaria che riceva come input una stringa e ritorni gli `input_ids`. Più o meno questo potrebbe essere il codice:

```python
input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/")
input_ids = model.tokenize(input_str)
```

Potrebbe essere necessario scavare un po' nella repo originaria per trovare la funzione corretta del tokenizer, e a volte capita persino di doverne riscrivere una parte per ottenere in output gli `input_ids`. A quel punto serve uno script analogo che usi 🤗 Transformers:

```python
from transformers import BrandNewBertTokenizer

input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."

tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/")

input_ids = tokenizer(input_str).input_ids
```

Quando i due script producono gli stessi `input_ids`, bisogna aggiungere anche un test per il tokenizer.

Il file di test del tokenizer di *brand_new_bert* dovrebbe contenere un paio di test d'integrazione hard-coded.


**10. Test end-to-end**

Ora che avete il tokenizer, dovrete aggiungere dei test d'integrazione per l'intero workflow in `tests/test_modeling_brand_new_bert.py` in 🤗 Transformers. Questi test devono mostrare, su un campione significativo di esempi text-to-text, che l'implementazione di 🤗 Transformers funziona come ci si aspetta. *Per esempio* potreste usare una traduzione source-to-target, il riassunto di un articolo, un esempio di domanda-risposta e così via. Se nessuno dei checkpoint è stato fine-tuned su task di questo tipo, sono più che sufficienti i test del modello. Nello step finale dovete assicurarvi che il modello sia totalmente funzionale, e consigliamo di testarlo anche su GPU: può succedere, ad esempio, che ci si scordi un `.to(self.device)`. Se non avete accesso a una GPU, il team Hugging Face può occuparsi di testare questo aspetto per voi.

**11. Aggiungere una Docstring**

Siete quasi alla fine! L'ultima cosa rimasta è avere una bella docstring e una pagina di documentazione. Il Cookiecutter dovrebbe aver già creato un template chiamato `docs/source/model_doc/brand_new_bert.rst`, che dovrete compilare. La prima cosa che un utente farà per usare il vostro modello sarà leggere la documentazione: proponete quindi una documentazione chiara e concisa. È molto utile per la community aggiungere anche qualche *Tip* che mostri come usare al meglio il modello. Non esitate a chiedere al team Hugging Face riguardo alle docstring.

Quindi, assicuratevi che la docstring sia stata aggiunta a `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` e che sia corretta, includendo tutti gli input e output necessari.
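Per esempio, il blocco di esempio d'uso che tipicamente si include nella docstring del modello potrebbe assomigliare a questo sketch, dove il nome del checkpoint e le classi sono segnaposto ipotetici:

```python
from transformers import BrandNewBertModel, BrandNewBertTokenizer

# Checkpoint ipotetico, usato solo a scopo illustrativo.
tokenizer = BrandNewBertTokenizer.from_pretrained("brand-new-bert-base-cased")
model = BrandNewBertModel.from_pretrained("brand-new-bert-base-cased")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)
last_hidden_state = outputs.last_hidden_state
```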
Abbiamo una guida dettagliata su come scrivere la documentazione e le docstring.


**Rifattorizzare il codice**

Perfetto! Ora che avete tutto il necessario per *brand_new_bert*, controllate che lo stile del codice sia corretto:

```bash
make style
```

E che il codice passi i quality check:

```bash
make quality
```

A volte capita che manchino delle informazioni nella docstring o che alcuni nomi siano sbagliati: questo farà fallire i controlli qui sopra. Ripetiamo: chiedete pure a Hugging Face, saremo lieti di aiutarvi.

Per ultimo, è sempre una buona idea rifattorizzare il proprio codice una volta che funziona.

Avete finito con il codice, congratulazioni! 🎉 Siete fantasticiiiiiii! 😎

**12. Caricare il modello sul model hub**

In quest'ultima parte dovrete convertire e caricare sul model hub il modello, con tutti i checkpoint, e aggiungere una model card per ogni checkpoint caricato. Leggete la nostra guida [Model sharing and uploading Page](model_sharing) per familiarizzare con l'hub. Di solito in questa parte lavorerete a fianco di Hugging Face per decidere un nome appropriato per ogni checkpoint e per ottenere i permessi necessari a caricare il modello nell'organizzazione dell'autore di *brand_new_bert*. Il metodo `push_to_hub`, presente in tutti i modelli `transformers`, è una maniera rapida e indolore per caricare il vostro checkpoint sull'hub:

```python
brand_new_bert.push_to_hub(
    repo_path_or_name="brand_new_bert",
    # Uncomment the following line to push to an organization
    # organization="",
    commit_message="Add model",
    use_temp_dir=True,
)
```

Vale la pena spendere un po' di tempo per creare una model card ad hoc per ogni checkpoint. Le model card dovrebbero evidenziare le caratteristiche specifiche del checkpoint, *per esempio* su che dataset è stato fatto il pretraining o il fine-tuning e per che genere di task è pensato. È anche buona pratica includere del codice che mostri come usare il modello correttamente.


**13. (Opzionale) Aggiungere un notebook**

È molto utile aggiungere un notebook che mostri in dettaglio come usare *brand_new_bert* per l'inferenza e/o per il fine-tuning su un task specifico. Non è obbligatorio per la vostra PR, ma è molto utile per la community.

**14. Sottomettere la PR**

L'ultimissimo step: il merge della PR nel main. Di solito il team Hugging Face a questo punto vi avrà già aiutato, ma vale la pena prendersi un po' di tempo per ripulire la descrizione e i commenti nel codice.


### Condividete il vostro lavoro!!

È ora di prendervi un po' di merito dalla community per il vostro lavoro! Caricare e implementare un nuovo modello è un grandissimo contributo per Transformers e per l'intera community NLP. Il vostro codice e i modelli pre-trained convertiti saranno sicuramente utilizzati da centinaia o migliaia di sviluppatori e ricercatori. Siate fieri e orgogliosi di condividere il vostro traguardo con l'intera community :)

**Avete creato un altro modello che è super facile da usare per tutti quanti nella community! 🤯**

diff --git a/docs/source/it/add_new_model.mdx b/docs/source/it/add_new_model.mdx deleted file mode 100644 index 464ba5830609..000000000000 --- a/docs/source/it/add_new_model.mdx +++ /dev/null @@ -1,775 +0,0 @@ - - -# Come aggiungere un modello a 🤗 Transformers? - -Aggiungere un nuovo modello é spesso difficile e richiede una profonda conoscenza della libreria 🤗 Transformers e anche -della repository originale del modello.
A Hugging Face cerchiamo di dare alla community sempre piú poteri per aggiungere -modelli independentemente. Quindi, per alcuni nuovi modelli che la community vuole aggiungere a 🤗 Transformers, abbiamo -creato una specifica *call-for-model-addition* che spiega passo dopo passo come aggiungere il modello richiesto. Con -questo *call-for-model-addition* vogliamo insegnare a volenterosi e esperti collaboratori della community come implementare -un modello in 🤗 Transformers. - -Se questo é qualcosa che può interessarvi, siete liberi di controllare l'attuale “calls-for-model-addition” [qui](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model/open_model_proposals/README.md) -e contattarci. - -Se il modello sarà selezionato, allora potrete lavorare insieme a un membro di Hugging Face per integrare il modello in 🤗 -Transformers. Così facendo, ci guadagnerai in una comprensione totale, sia teorica che pratica, del modello proposto. Inoltre, -sarai l'artefice di un importante contributo open-source a 🤗 Transformers. Durante l'implementazione avrai l'opportunità di: - -- ottenere più comprensione delle best practices in open-source -- capire i principi di design di una della librerie NLP più popolari -- capire come efficientemente testare complessi modelli NLP -- capire come integrare utilit Python come `black`, `isort`, `make fix-copies` in una libreria per garantire sempre di avere un codice leggibile e pulito - -Siamo anche contenti se vuoi aggiungere un modello che non può essere trovato nella cartella “calls-for-model-addition”. -Le seguenti sezioni spiegano in dettaglio come aggiungere un nuovo modello. Può anche essere molto utile controllare modelli -già aggiunti [qui](https://github.com/huggingface/transformers/pulls?q=is%3Apr+label%3A%22PR+for+Model+Addition%22+is%3Aclosed), -per capire se richiamano il modello che vorreste aggiungere. - -Per cominciare, vediamo una panoramica general della libreria Transformers. - -## Panoramica generale su 🤗 Transformers - -Prima di tutto, vediamo in generale 🤗 Transformers. 🤗 Transformers é una libreria molto strutturata, quindi -puà essere che a volte ci sia un disaccordo con alcune filosofie della libreria o scelte di design. Dalla nostra esperienza, -tuttavia, abbiamo trovato che le scelte fondamentali di design della libreria sono cruciali per usare 🤗 Transformers efficacemente -su larga scala, mantenendo i costi a un livello accettabile. - -Un buon primo punto di partenza per capire al meglio la libreria é leggere la [documentazione sulla nostra filosofia](filosofia) -Da qui, ci sono alcune scelte sul modo di lavorare che cerchiamo di applicare a tutti i modelli: - -- La composizione é generalmente favorita sulla sovra-astrazione -- Duplicare il codice non é sempre male, soprattutto se migliora notevolmente la leggibilità e accessibilità del modello -- Tutti i files creati per il nuovo modello devono il piu possibile "compatti". Questo vuol dire che quando qualcuno leggerá il codice -di uno specifico modello, potrá vedere solo il corrispettivo file `modeling_....py` senza avere multiple dipendenze. - - -La cosa piú importante, é che consideriamo la libreria non solo un mezzo per dare un prodotto, *per esempio* dare la possibilità -di usare BERT per inferenza, ma é anche il prodotto reale che noi vogliamo migliorare sempre più. Quindi, quando aggiungi -un modello, non sei solo la persona che userà il modello, ma rappresenti anche tutti coloro che leggeranno, -cercheranno di capire e modificare il tuo modello. 
- -Tenendo questi principi in mente, immergiamoci nel design generale della libreria. - -### Panoramica sui modelli - -Per aggiungere con successo un modello, é importante capire l'interazione tra il tuo modello e la sua configurazione, -[`PreTrainedModel`], e [`PretrainedConfig`]. Per dare un esempio, chiameremo il modello da aggiungere a 🤗 Transformers -`BrandNewBert`. - -Diamo un'occhiata: - - - -Come potete vedere, ci basiamo sull'ereditarietà in 🤗 Transformers, tenendo però il livello di astrazione a un minimo -assoluto. Non ci sono mai più di due livelli di astrazione per ogni modello nella libreria. `BrandNewBertModel` eredita -da `BrandNewBertPreTrainedModel` che, a sua volta, eredita da [`PreTrainedModel`] - semplice no? -Come regola generale, vogliamo essere sicuri che un nuovo modello dipenda solo da [`PreTrainedModel`]. Le funzionalità -importanti che sono automaticamente conferite a ogni nuovo modello sono [`~PreTrainedModel.from_pretrained`] -e [`~PreTrainedModel.save_pretrained`], che sono usate per serializzazione e deserializzazione. Tutte le altre importanti -funzionalità, come ad esempio `BrandNewBertModel.forward` devono essere definite completamente nel nuovo script -`modeling_brand_new_bert.py`. Inoltre, vogliamo essere sicuri che un modello con uno specifico head layer, come -`BrandNewBertForMaskedLM` non erediti da `BrandNewBertModel`, ma piuttosto usi `BrandNewBertModel` -come componente che può essere chiamata nel passaggio forward per mantenere il livello di astrazione basso. Ogni -nuovo modello richieste una classe di configurazione, chiamata `BrandNewBertConfig`. Questa configurazione é sempre -mantenuta come un attributo in [`PreTrainedModel`], e quindi può essere accessibile tramite l'attributo `config` -per tutte le classi che ereditano da `BrandNewBertPreTrainedModel`: - -```python -model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert") -model.config # il modello ha accesso al suo config -``` - -Analogamente al modello, la configurazione eredita le funzionalità base di serializzazione e deserializzazione da -[`PretrainedConfig`]. É da notare che la configurazione e il modello sono sempre serializzati in due formati differenti - -il modello é serializzato in un file *pytorch_model.bin* mentre la configurazione con *config.json*. Chiamando -[`~PreTrainedModel.save_pretrained`] automaticamente chiamerà [`~PretrainedConfig.save_pretrained`], cosicché sia il -modello che la configurazione siano salvati. - - -### Stile per il codice - -Quando codifichi un nuovo modello, tieni presente che Transformers ha una sua struttura di fondo come libreria, perciò -ci sono alcuni fatti da considerare su come scrivere un codice :-) - -1. Il forward pass del tuo modello dev'essere scritto completamente nel file del modello, mentre dev'essere indipendente - da altri modelli nella libreria. Se vuoi riutilizzare un blocco di codice da un altro modello, copia e incolla il codice con un commento `# Copied from` in cima al codice (guarda [qui](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160) - per un ottimo esempio). -2. Il codice dev'essere interamente comprensibile, anche da persone che non parlano in inglese. Questo significa che le - variabili devono avere un nome descrittivo e bisogna evitare abbreviazioni. Per esempio, `activation` é molto meglio - che `act`. Le variabili con una lettera sono da evitare fortemente, almeno che non sia per un indce in un for loop. -3. 
Generamente é meglio avere un codice esplicito e piú lungo che un codice corto e magico. -4. Evita di subclassare `nn.Sequential` in Pytorch, puoi subclassare `nn.Module` e scrivere il forward pass, cosicché - chiunque può effettuare debug sul tuo codice, aggiungendo print o breaking points. -5. La tua function-signature dev'essere type-annoted. Per il resto, é meglio preferire variabili con un nome accettabile - piuttosto che annotazioni per aumentare la comprensione e leggibilità del codice. - -### Panoramica sui tokenizers - -Questa sezione sarà creata al piu presto :-( - -## Aggiungere un modello a 🤗 Transformers passo dopo passo - -Ci sono differenti modi per aggiungere un modello a Hugging Face. Qui trovi una lista di blog posts da parte della community su come aggiungere un modello: - -1. [Aggiungere GPT2](https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28) scritto da [Thomas](https://huggingface.co/thomwolf) -2. [Aggiungere WMT19 MT](https://huggingface.co/blog/porting-fsmt) scritto da [Stas](https://huggingface.co/stas) - -Per esperienza, possiamo dirti che quando si aggiunge un modello é meglio tenere a mente le seguenti considerazioni: - -- Non sfondare una porta giá aperta! La maggior parte del codice che aggiungerai per un nuovo modello 🤗 Transformers - esiste già da qualche parte in 🤗 Transformers. Prendi un po' di tempo per trovare codici simili in modelli e tokenizers esistenti e fare un copia-incolla. Ricorda che [grep](https://www.gnu.org/software/grep/) e [rg](https://github.com/BurntSushi/ripgrep) sono tuoi buoni amici. Inoltre, ricorda che puó essere molto probabile che il tokenizer per il tuo modello sia basato sull'implementazione di un altro modello, e il codice del tuo modello stesso su un altro ancora. *Per esempio* il modello FSMT é basato su BART, mentre il tokenizer di FSMT é basato su XLM. -- Ricorda che qui é piu una sfida ingegneristica che scientifica. Spendi piú tempo per create un efficiente ambiente di debugging piuttosto che cercare di capire tutti gli aspetti teorici dell'articolo del modello. -- Chiedi aiuto se sei in panne! I modelli sono la parte principale di 🤗 Transformers, perciò qui a Hugging Face siamo più che contenti di aiutarti in ogni passo per aggiungere il tuo modello. Non esitare a chiedere se vedi che non riesci a progredire. - -Di seguito, diamo una ricetta generale per aiutare a portare un modello in 🤗 Transformers. - -La lista seguente é un sommario di tutto quello che é stato fatto per aggiungere un modello, e può essere usata come To-Do List: - -- 1. ☐ (Opzionale) Capire gli aspetti teorici del modello -- 2. ☐ Preparare l'ambiente dev per transformers -- 3. ☐ Preparare l'ambiente debugging della repository originale -- 4. ☐ Create uno script che gestisca con successo il forward pass usando la repository originale e checkpoint -- 5. ☐ Aggiungere con successo lo scheletro del modello a Transformers -- 6. ☐ Convertire i checkpoint original a Transformers checkpoint -- 7. ☐ Effettuare con successo la forward pass in Transformers, di modo che dia un output identico al checkpoint originale -- 8. ☐ Finire i tests per il modello in Transformers -- 9. ☐ Aggiungere con successo Tokenizer in Transformers -- 10. ☐ Testare e provare gli integration tests da capo a fine -- 11. ☐ Completare i docs -- 12. ☐ Caricare i moedl weights all'hub -- 13. ☐ Sottomettere una pull request -- 14. 
☐ (Opzionale) Aggiungere un notebook con una demo - -Per cominciare di solito consigliamo `BrandNewBert`, partendo dalla teoria, di modo da avere una buona comprensione della teoria generale. TUttavia, se preferisci imparare l'aspetto teorico del modello mentre *lavori* sul modello é ok immergersi direttamente nel codice di `BrandNewBert`. Questa opzione puó essere buona se le tue skills ingegneristiche sono meglio che quelle teoriche, o se il paper `BrandNewBert` ti dá problemi, o se semplicemente ti piace programmare piú che leggere articoli scientifici. - -### 1. (Opzionale) Aspetti teorici di BrandNewBert - -Allora con calma, prendi un po' di tempo per leggere l'articolo su *BrandNewBert* . Sicuramente, alcune sezioni dell'articolo sono molto complesse, ma non preoccuparti! L'obiettivo non é avere una compresione immensa della teoria alla base, ma estrarre le informazioni necessarie per re-implementare con successo il modello in 🤗 Transformers. Quindi, non impazzire sugli aspetti teorici, ma piuttosto focalizzati su quelli pratici, ossia: - -- Che tipo di modello é *brand_new_bert*? É solo un encoder in stile BERT? O tipo decoder come GPT2? O encoder e decoder stile BART? Dai un'occhiata a [model_summary](model_summary) se non sei famigliare con le differenze tra questi modelli -- Quali sono le applicazioni di *brand_new_bert*? Classificazione di testo? Generazione di testo? O per tasks del genere seq2seq? -- Quali sono le nuove aggiunte al modello che lo rendono diverso da BERT/GPT-2/BART? -- Quali modelli estistenti in [🤗 Transformers models](https://huggingface.co/transformers/#contents) sono molto simili a *brand_new_bert*? -- Che tipo di tokenizer si usa in questo caso? Un sentencepiece tokenizer? O un word piece tokenizer? Il tokenizer é lo stesso di BERT o BART? - -Una volta che senti che hai avuto una bella overview dell'architettura del modello, puoi scrivere senza problemi al team di Hugging Face per ogni domanda che tu hai. Questo puó includere domande sull'architettura del modello, o sull'attention layer, etc. Saremo molto felici di aiutarti :) - - -### 2. Prepare il tuo ambiente - -1. Forka la [repository](https://github.com/huggingface/transformers) cliccando sul tasto ‘Fork' nella pagina della repository. Questo crea una copia del codice nel tuo account GitHub - -2. Clona il tuo fork `transfomers` sul tuo dico locale, e aggiungi la repository base come remota: - -```bash -git clone https://github.com/[your Github handle]/transformers.git -cd transformers -git remote add upstream https://github.com/huggingface/transformers.git -``` - - -3. Crea un ambiente di sviluppo, per esempio tramite questo comando: - -```bash -python -m venv .env -source .env/bin/activate -pip install -e ".[dev]" -``` - -quindi torna alla directory principale: - -```bash -cd .. -``` - - -4. Attenzione, raccomandiamo di aggiungere la versione di PyTorch di *brand_new_bert* a Transfomers. Per installare PyTorch, basta seguire queste istruzioni https://pytorch.org/get-started/locally/. - -**Nota bene:** Non c'é bisogno di installare o avere installato CUDA. Il nuovo modello può funzionare senza problemi su una CPU. - - -5. Per trasferire *brand_new_bert* To port *brand_new_bert* avrai bisogno anche accesso alla sua repository originale: - -```bash -git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git -cd brand_new_bert -pip install -e . -``` - -Ok, ora hai un ambiente di sviluppo per portare *brand_new_bert* in 🤗 Transformers. - - -### 3.-4. 
Provare un pretrained checkpoint usando la repo originale - -Per cominciare, comincerai a lavorare sulla repo originale di *brand_new_bert*. Come spesso accade, l'implementazione originale é molto sullo stile "ricerca". Questo significa che a volte la documentazione non é al top, magari manca qualche cosa e il codice puó essere difficile da capire. Tuttavia, questa é e dev'essere la motivazione per reimplementare *brand_new_bert*. In Hugging Face, uno degli obiettivi principali é di *mettere le persone sulle spalle dei giganti*, il che si traduce, in questo contesto, di prendere un modello funzionante e riscriverlo e renderlo il piú possibile **accessibile, user-friendly, e leggibile**. Questa é la top motivazione per re-implementare modelli in 🤗 Transformers - cercare di creare nuove complesse tecnologie NLP accessibili a **chiunque**. - -Riuscire a far girare il modello pretrained originale dalla repository ufficiale é spesso il passo **piu arduo**. Dalla nostra esperienza, é molto importante spendere un p' di tempo per diventare familiari con il codice base originale. Come test, prova a capire i seguenti punti: - -- Dove si trovano i pretrained weights? -- Come caricare i pretrained weights nel modello corrispondente? -- Come girare un tokenizer independentemente dal modello? -- Prova a tracciare un singolo forward pass, cosicché potrai sapere che classi e funzioni sono richieste per un semplice forward pass. Di solito, dovrai reimplementare queste funzioni e basta -- Prova a localizzare i componenti importanti del modello: Dove si trova la classe del modello? Ci sono sotto classi nel modello *per esempio* EngoderModel, DecoderMOdel? Dove si trova il self-attention layer? Ci sono molteplici differenti layer di attention, *per esempio * *self-attention*, *cross-attention*...? -- Come puoi fare debug sul modello nell'ambiente originale della repo? Devi aggiungere dei *print* o puoi usare *ipdb* come debugger interattivo, o vabene anche un IDE efficiente per debug come PyCharm? - -É molto importante che prima di cominciare a trasferire il modello nuovo tu spenda tempo a fare debug del codice originale in maniera **efficiente**! Inoltre, ricorda che tutta la library é open-soruce, quindi non temere di aprire issue o fare una pull request nella repo originale. Tutti coloro che mantengono la repository saranno piú che felici di avere qualcuno che guarda e gioca con i loro codici! - -A questo punto, sta a te decidere quale ambiente per debug vuoi usare. Noi consilgiamo di evitare setup con GPU, che potrebbero costare assai, lavorare su una CPU puó essere un ottimo punto di partenza per indagare la repository originale e per cominciare a scrivere il codice per 🤗 Transformers. Solo alla fine, quando il modello é stato portato con successo in 🤗 Transformers, allora si potrá verificare il suo funzionamento su GPU. - -In generale ci sono due possibili ambienti di debug per il testare il modello originale: - -- [Jupyter notebooks](https://jupyter.org/) / [google colab](https://colab.research.google.com/notebooks/intro.ipynb) -- Scripts locali in Python - -Il vantaggio dei Jupyter notebooks é la possibilità di eseguire cella per cella, il che può essere utile per decomporre tutte le componenti logiche, cosi da a vere un ciclo di debug più rapido, siccome si possono salvare i risultati da steps intermedi. Inoltre, i notebooks spesso sono molto facili da condividere con altri contributors, il che può essere molto utile se vuoi chiedere aiuto al team di Hugging Face. 
Se sei famigliare con Jupyter notebooks allora racommandiamo di lavorare in questa maniera. - -Ovviamente se non siete abituati a lavorare con i notebook, questo può essere uno svantaggio nell'usare questa tecnologia, sprecando un sacco di tempo per setup e portare tutto al nuovo ambiente, siccome non potreste neanche usare dei tools di debug come `ipdb`. - -Per ogni pratica code-base, é sempre meglio come primo step caricare un **piccolo** checkpoint pretrained e cercare di riprodurre un singolo forward pass usando un vettore fittizio di IDs fatti da numeri interi. Un esempio per uno script simile, in pseudocodice é: - -```python -model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") -input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids -original_output = model.predict(input_ids) -``` - -Per quanto riguarda la strategia di debugging, si può scegliere tra: - -- Decomporre il modello originario in piccole componenenti e testare ognuna di esse -- Decomporre il modello originario nel *tokenizer* originale e nel *modello* originale, testare un forward pass su questi, -e usare dei print statement o breakpoints intermedi per verificare - -Ancora una volta, siete liberi di scegliere quale strategia sia ottimale per voi. Spesso una strategia é piu -avvantaggiosa di un'altra, ma tutto dipende dall'code-base originario. - -Se il code-base vi permette di decomporre il modello in piccole sub-componenenti, *per esempio* se il code-base -originario può essere facilmente testato in eager mode, allora vale la pena effettuare un debugging di questo genere. -Ricordate che ci sono dei vantaggi nel decidere di prendere la strada piu impegnativa sin da subito: - -- negli stage piu finali, quando bisognerà comparare il modello originario all'implementazione in Hugging Face, potrete verificare -automaticamente ogni componente, individualmente, di modo che ci sia una corrispondenza 1:1 -- avrete l'opportunità di decomporre un problema molto grande in piccoli passi, così da strutturare meglio il vostro lavoro -- separare il modello in componenti logiche vi aiuterà ad avere un'ottima overview sul design del modello, quindi una migliore -comprensione del modello stesso -- verso gli stage finali i test fatti componente per componente vi aiuterà ad essere sicuri di non andare avanti e indietro -nell'implementazione, così da continuare la modifica del codice senza interruzione - -Un ottimo esempio di come questo può essere fatto é dato da [Lysandre](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed) -per il modello ELECTRA - -Tuttavia, se il code-base originale é molto complesso o le componenti intermedie possono essere testate solo in tramite -compilazione, potrebbe richiedere parecchio tempo o addirittura essere impossibile separare il modello in piccole sotto-componenti. -Un buon esempio é [MeshTensorFlow di T5](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow). Questa libreria -é molto complessa e non offre un metodo semplice di decomposizione in sotto-componenti. Per simili librerie, potrete fare -affidamento ai print statements. - -In ogni caso, indipendentemente da quale strategia scegliete, la procedura raccomandata é di cominciare a fare debug dal -primo layer al layer finale. -É consigliato recuperare gli output dai layers, tramite print o sotto-componenti, nel seguente ordine: - -1. Recuperare gli IDs di input dati al modello -2. Recuperare i word embeddings -3. Recuperare l'input del primo Transformer layer -4. 
Recuperare l'output del primo Transformer layer -5. Recuperare l'output dei seguenti `n - 1` Transformer layers -6. Recuperare l'output dell'intero BrandNewBert Model - -Gli IDs in input dovrebbero essere un arrary di interi, *per esempio* `input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]` - -Gli output dei seguenti layer di solito dovrebbero essere degli array di float multi-dimensionali come questo: - -``` -[[ - [-0.1465, -0.6501, 0.1993, ..., 0.1451, 0.3430, 0.6024], - [-0.4417, -0.5920, 0.3450, ..., -0.3062, 0.6182, 0.7132], - [-0.5009, -0.7122, 0.4548, ..., -0.3662, 0.6091, 0.7648], - ..., - [-0.5613, -0.6332, 0.4324, ..., -0.3792, 0.7372, 0.9288], - [-0.5416, -0.6345, 0.4180, ..., -0.3564, 0.6992, 0.9191], - [-0.5334, -0.6403, 0.4271, ..., -0.3339, 0.6533, 0.8694]]], -``` - -Ci aspettiamo che ogni modello aggiunto a 🤗 Transformers passi con successo un paio di test d'integrazione. Questo -significa che il modello originale e la sua implementazione in 🤗 Transformers abbiano lo stesso output con una precisione -di 0.001! Siccome é normale che lo stesso esatto modello, scritto in librerie diverse, possa dare output leggermente -diversi, la tolleranza accettata é 1e-3 (0.001). Ricordate che i due modelli devono dare output quasi identici. Dunque, -é molto conveniente comparare gli output intermedi di 🤗 Transformers molteplici volte con gli output intermedi del -modello originale di *brand_new_bert*. Di seguito vi diamo alcuni consigli per avere un ambiente di debug il piu efficiente -possibile: - -- Trovate la migliore strategia per fare debug dei risultati intermedi. Per esempio, é la repository originale scritta in PyTorch? -Se si, molto probabilmente dovrete dedicare un po' di tempo per scrivere degli script piu lunghi, così da decomporre il -modello originale in piccole sotto-componenti, in modo da poter recuperare i valori intermedi. Oppure, la repo originale -é scritta in Tensorflow 1? Se é così dovrete fare affidamento ai print di Tensorflow [tf.print](https://www.tensorflow.org/api_docs/python/tf/print) -per avere i valori intermedi. Altro caso, la repo é scritta in Jax? Allora assicuratevi che il modello non sia in **jit** -quanto testate il foward pass, *per esempio* controllate [questo link](https://github.com/google/jax/issues/196). -- Usate i più piccoli pretrained checkpoint che potete trovare. Piu piccolo é il checkpoint, piu velocemente sarà il vostro -ciclo di debug. Non é efficiente avere un pretrained model così gigante che per il forward pass impieghi piu di 10 secondi. -Nel caso in cui i checkpoints siano molto grandi, e non si possa trovare di meglio, allora é buona consuetudine ricorrere -a fare un dummy model nel nuovo ambiente, con weights inizializzati random e salvare quei weights per comprare la versione 🤗 Transformers -con il vostro modello -- Accertatevi di usare la via piu semplice per chiamare il forward pass nella repo originale. Sarebbe opportuno trovare -la funzione originaria che chiami **solo** un singolo forward pass, *per esempio* questa funzione spesso viene chiamata -`predict`, `evaluate`, `forward` o `__call__`. Siate sicuri di non fare debug su una funzione che chiami `forward` molteplici -volte, *per esempio* per generare testo, come `autoregressive_sample`, `generate`. -- Cercate di separare la tokenization dal forward pass del modello. Se la repo originaria mostra esempio dove potete dare -come input una stringa, provate a cercare dove nella forward call la stringa viene cambiata in input ids e cominciate il -debug da questo punto. 
Questo vi garantisce un ottimo punto di partenza per scrivere un piccolo script personale dove dare -gli input al modello, anziche delle stringhe in input. -- Assicuratevi che il debugging **non** sia in training mode. Spesso questo potra il modello a dare degli output random, per -via dei molteplici dropout layers. Assicuratevi che il forward pass nell'ambiente di debug sia **deterministico**, cosicche -i dropout non siano usati. Alternativamente, potete usare *transformers.utils.set_seed* se la vecchia e nuova implementazione -sono nello stesso framework. - -La seguente sezione vi da ulteriori dettagli e accorgimenti su come potete fare tutto questo per *brand_new_bert*. - - -### 5.-14. Trasferire BrandNewBert in 🤗 Transformers - -Allora cominciamo ad aggiungere un nuovo codice in 🤗 Transformers. Andate nel vostro fork clone di 🤗 Transformers: - - -```bash -cd transformers -``` - -Nel caso speciale in cui stiate aggiungendo un modello, la cui architettura sia identica a una di un modello già esistente, -dovrete solo aggiugnere uno script di conversione, come descritto [qui](#write-a-conversion-script). -In questo caso, potete riutilizzare l'intera architettura del modello gia esistente. - -Se questo non é il caso, cominciamo con il generare un nuovo modello. Avrete due opzioni: - -- `transformers-cli add-new-model-like` per aggiungere un nuovo modello come uno che gia esiste -- `transformers-cli add-new-model` per aggiungere un nuovo modello da un nostro template (questo assomigliera a BERT o Bart, in base al modello che selezionerete) - -In entrambi i casi, l'output vi darà un questionario da riempire con informazioni basi sul modello. Il secondo comando richiede di installare -un `cookiecutter` - maggiori informazioni [qui](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model). - -**Aprire una Pull Request in main huggingface/transformers repo** - -Prime di cominciare ad adattare il codice automaticamente generato, aprite una nuova PR come "Work in progress (WIP)", -*per esempio* "[WIP] Aggiungere *brand_new_bert*", cosicché il team di Hugging Face possa lavorare al vostro fianco nell' -integrare il modello in 🤗 Transformers. - -Questi sarebbero gli step generali da seguire: - -1. Creare un branch dal main branch con un nome descrittivo - -```bash -git checkout -b add_brand_new_bert -``` - -2. Commit del codice automaticamente generato - -```bash -git add . -git commit -``` - -3. Fare fetch e rebase del main esistente - -```bash -git fetch upstream -git rebase upstream/main -``` - -4. Push dei cambiamenti al proprio account: - -```bash -git push -u origin a-descriptive-name-for-my-changes -``` - -5. Una volte che siete soddisfatti dei nuovi cambiamenti, andate sulla webpage del vostro fork su GitHub. Cliccate "Pull request". -Assiuratevi di aggiungere alcuni membri di Hugging Face come reviewers, nel riguardo alla destra della pagina della PR, cosicche il team -Hugging Face verrà notificato anche per i futuri cambiamenti. - -6. Cambiare la PR a draft, cliccando su "Convert to draft" alla destra della pagina della PR - -Da quel punto in poi, ricordate di fare commit di ogni progresso e cambiamento, cosicche venga mostrato nella PR. Inoltre, -ricordatevi di tenere aggiornato il vostro lavoro con il main esistente: - -```bash -git fetch upstream -git merge upstream/main -``` - -In generale, tutte le domande che avrete riguardo al modello o l'implementazione dovranno essere fatte nella vostra PR -e discusse/risolte nella PR stessa. 
In questa maniera, il team di Hugging Face sarà sempre notificato quando farete commit -di un nuovo codice o se avrete qualche domanda. É molto utile indicare al team di Hugging Face il codice a cui fate riferimento -nella domanda, cosicche il team potra facilmente capire il problema o la domanda. - -Per fare questo andate sulla tab "Files changed", dove potrete vedere tutti i vostri cambiamenti al codice, andate sulla linea -dove volete chiedere una domanda, e cliccate sul simbolo "+" per aggiungere un commento. Ogni volta che una domanda o problema -é stato risolto, cliccate sul bottone "Resolve". - -In questa stessa maniera, Hugging Face aprirà domande o commenti nel rivedere il vostro codice. Mi raccomando, chiedete più -domande possibili nella pagina della vostra PR. Se avete domande molto generali, non molto utili per il pubblico, siete liberi -di chiedere al team Hugging Face direttamente su slack o email. - - -**5. Adattare i codici per brand_new_bert** - -Per prima cosa, ci focalizzeremo sul modello e non sui tokenizer. Tutto il codice relative dovrebbe trovarsi in -`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` e -`src/transformers/models/brand_new_bert/configuration_brand_new_bert.py`. - -Ora potete finalmente cominciare il codice :). Il codice generato in -`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` avrà sia la stessa architettura di BERT se é un -modello encoder-only o BART se é encoder-decoder. A questo punto, ricordatevi cio che avete imparato all'inizio, riguardo -agli aspetti teorici del modello: *In che maniera il modello che sto implmementando é diverso da BERT o BART?*. Implementare -questi cambi spesso vuol dire cambiare il layer *self-attention*, l'ordine dei layer di normalizzazione e così via... -Ancora una volta ripetiamo, é molto utile vedere architetture simili di modelli gia esistenti in Transformers per avere -un'idea migliore su come implementare il modello. - -**Notate** che a questo punto non dovete avere subito un codice tutto corretto o pulito. Piuttosto, é consigliato cominciare con un -codice poco pulito, con copia-incolla del codice originale in `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` -fino a che non avrete tutto il codice necessario. In base alla nostra esperienza, é molto meglio aggiungere una prima bozza -del codice richiesto e poi correggere e migliorare iterativamente. L'unica cosa essenziale che deve funzionare qui é la seguente -instanza: - -```python -from transformers import BrandNewBertModel, BrandNewBertConfig - -model = BrandNewBertModel(BrandNewBertConfig()) -``` - -Questo comando creerà un modello con i parametri di default definiti in `BrandNewBergConfig()` e weights random. Questo garantisce -che `init()` di tutte le componenti funzioni correttamente. - - -**6. Scrivere uno script di conversione** - -Il prossimo step é scrivere uno script per convertire il checkpoint che avete usato per fare debug su *brand_new_berts* nella -repo originale in un checkpoint per la nuova implementazione di *brand_new_bert* in 🤗 Transformers. Non é consigliato scrivere -lo script di conversione da zero, ma piuttosto cercate e guardate script gia esistenti in 🤗 Transformers, così da trovarne -uno simile al vostro modello. Di solito basta fare una copia di uno script gia esistente e adattarlo al vostro caso. -Non esistate a chiedre al team di Hugging Face a riguardo. 
- -- Se state convertendo un modello da TensorFlow a PyTorch, un ottimo inizio é vedere [questo script di conversione per BERT](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91) -- Se state convertendo un modello da PyTorch a PyTorch, [lo script di conversione di BART può esservi utile](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py) - -Qui di seguito spiegheremo come i modelli PyTorch salvano i weights per ogni layer e come i nomi dei layer sono definiti. In PyTorch, -il nomde del layer é definito dal nome della class attribute che date al layer. Definiamo un modello dummy in PyTorch, -chiamato `SimpleModel`: - -```python -from torch import nn - - -class SimpleModel(nn.Module): - def __init__(self): - super().__init__() - self.dense = nn.Linear(10, 10) - self.intermediate = nn.Linear(10, 10) - self.layer_norm = nn.LayerNorm(10) -``` -Ora possiamo creare un'instanza di questa definizione di modo da inizializzare a random weights: `dense`, `intermediate`, `layer_norm`. -Possiamo usare print per vedere l'architettura del modello: - -```python -model = SimpleModel() - -print(model) -``` - -Da cui si ottiene: - -``` -SimpleModel( - (dense): Linear(in_features=10, out_features=10, bias=True) - (intermediate): Linear(in_features=10, out_features=10, bias=True) - (layer_norm): LayerNorm((10,), eps=1e-05, elementwise_affine=True) -) -``` - -Si può vedere come i nomi dei layers siano definiti dal nome della class attribute in PyTorch. I valori dei weights di uno -specifico layer possono essere visualizzati: - - -```python -print(model.dense.weight.data) -``` - -ad esempio: - -``` -tensor([[-0.0818, 0.2207, -0.0749, -0.0030, 0.0045, -0.1569, -0.1598, 0.0212, - -0.2077, 0.2157], - [ 0.1044, 0.0201, 0.0990, 0.2482, 0.3116, 0.2509, 0.2866, -0.2190, - 0.2166, -0.0212], - [-0.2000, 0.1107, -0.1999, -0.3119, 0.1559, 0.0993, 0.1776, -0.1950, - -0.1023, -0.0447], - [-0.0888, -0.1092, 0.2281, 0.0336, 0.1817, -0.0115, 0.2096, 0.1415, - -0.1876, -0.2467], - [ 0.2208, -0.2352, -0.1426, -0.2636, -0.2889, -0.2061, -0.2849, -0.0465, - 0.2577, 0.0402], - [ 0.1502, 0.2465, 0.2566, 0.0693, 0.2352, -0.0530, 0.1859, -0.0604, - 0.2132, 0.1680], - [ 0.1733, -0.2407, -0.1721, 0.1484, 0.0358, -0.0633, -0.0721, -0.0090, - 0.2707, -0.2509], - [-0.1173, 0.1561, 0.2945, 0.0595, -0.1996, 0.2988, -0.0802, 0.0407, - 0.1829, -0.1568], - [-0.1164, -0.2228, -0.0403, 0.0428, 0.1339, 0.0047, 0.1967, 0.2923, - 0.0333, -0.0536], - [-0.1492, -0.1616, 0.1057, 0.1950, -0.2807, -0.2710, -0.1586, 0.0739, - 0.2220, 0.2358]]). -``` - -Nello script di conversione, dovreste riempire quei valori di inizializzazione random con gli stessi weights del corrispondente -layer nel checkpoint. *Per esempio* - -```python -# retrieve matching layer weights, e.g. by -# recursive algorithm -layer_name = "dense" -pretrained_weight = array_of_dense_layer - -model_pointer = getattr(model, "dense") - -model_pointer.weight.data = torch.from_numpy(pretrained_weight) -``` - -Così facendo, dovete verificare che ogni inizializzazione random di un peso del modello PyTorch e il suo corrispondente peso nel pretrained checkpoint -siano esattamente gli stessi e uguali in **dimensione/shape e nome**. 
Per fare questo, é **necessario** aggiungere un `assert` -per la dimensione/shape e nome: - -```python -assert ( - model_pointer.weight.shape == pretrained_weight.shape -), f"Pointer shape of random weight {model_pointer.shape} and array shape of checkpoint weight {pretrained_weight.shape} mismatched" -``` - -Inoltre, dovrete fare il print sia dei nomi che dei weights per essere sicuri che siano gli stessi: - -```python -logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}") -``` - -Se la dimensione o il nome non sono uguali, probabilmente avete sbagliato ad assegnare il peso nel checkpoint o nel layer costrutture di - 🤗 Transformers. - -Una dimensione sbagliata può essere dovuta ad un errore nei parameteri in `BrandNewBertConfig()`. Tuttavia, può essere anche -che l'implementazione del layer in PyTorch richieda di fare una transposizione della matrice dei weights. - -Infine, controllate **tutti** che tutti i weights inizializzati e fate print di tutti i weights del checkpoint che non sono stati -usati per l'inizializzazione, di modo da essere sicuri che il modello sia correttamente convertito. É normale che ci siano -errori nel test di conversione, fai per un errore in `BrandNewBertConfig()`, o un errore nell'architettura in 🤗 Transformers, -o un bug in `init()`. - -Questo step dev'essere fatto tramite iterazioni fino a che non si raggiungano gli stessi valori per i weights. Una volta che -il checkpoint é stato correttamente caricato in 🤗 Transformers, potete salvare il modello in una cartella di vostra scelta -`/path/to/converted/checkpoint/folder` che contenga sia -`pytorch_model.bin` che `config.json`: - -```python -model.save_pretrained("/path/to/converted/checkpoint/folder") -``` - - -**7. Implementare il forward pass** - -Una volta che i weights pretrained sono stati correttamente caricati in 🤗 Transformers, dovrete assicurarvi che il forward pass -sia correttamente implementato. [Qui](#provare-un-pretrained-checkpoint-usando-la-repo-originale), avete give creato e provato -uno script che testi il forward pass del modello usando la repo originaria. Ora dovrete fare lo stesso con uno script analogo -usando l'implementazione in 🤗 Transformers anziché l'originale. Piu o meno lo script dovrebbe essere: - -```python -model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder") -input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19] -output = model(input_ids).last_hidden_states -``` - -Di solito l'output da 🤗 Transformers non é uguale uguale all'output originario, sopratto la prima volta. Non vi abbattete - -é normale! Prima di tutto assicuratevi che non ci siano errori o che non vengano segnalati degli errori nella forward pass. -Spesso capita che ci siano dimensioni sbagliate o data type sbagliati, *ad esempio* `torch.long` anziche `torch.float32`. -Non esistate a chiedere al team Hugging Face! - -Nella parte finale assicuratevi che l'implementazione 🤗 Transformers funzioni correttamente cosi da testare che gli output -siano equivalenti a una precisione di `1e-3`. Controllate che `outputs.shape` siano le stesse tra 🤗 Transformers e l'implementazione -originaria. Poi, controllate che i valori in output siano identici. 
Questa é sicuramente la parte più difficile, qui una serie -di errori comuni quando gli output non sono uguali: - -- Alcuni layers non sono stati aggiunti, *ad esempio* un *activation* layer non é stato aggiunto, o ci si é scordati di una connessione -- La matrice del word embedding non é stata ripareggiata -- Ci sono degli embeddings posizionali sbagliati perché l'implementazione originaria ha un offset -- Il dropout é in azione durante il forward pass. Per sistemare questo errore controllate che *model.training = False* e che -il dropout non sia stato attivato nel forward pass, * per esempio * passate *self.training* a [PyTorch's functional dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout) - -La miglior maniera per sistemare il problema é di vedere all'implementazione originaria del forward pass e in 🤗 Transformers -fianco a fianco e vedere se ci sono delle differenze. In teoria, con debug e print degli output intermedie di entrambe le -implementazioni nel forward pass nell'esatta posizione del network dovrebbe aiutarvi a vedere dove ci sono differenze tra -i due frameworks. Come prima mossa controllate che `input_ids` siano identici in entrambi gli scripts. Da lì andate fino -all'ultimo layer. Potrete notare una differenza tra le due implementazioni a quel punto. - -Una volta che lo stesso output é stato ragguingi, verificate gli output con `torch.allclose(original_output, output, atol=1e-3)`. -A questo punto se é tutto a posto: complimenti! Le parti seguenti saranno una passeggiata 😊. - - -**8. Aggiungere i test necessari per il modello** - -A questo punto avete aggiunto con successo il vostro nuovo modello. Tuttavia, é molto probabile che il modello non sia -del tutto ok con il design richiesto. Per essere sicuri che l'implementazione sia consona e compatibile con 🤗 Transformers é -necessario implementare dei tests. Il Cookiecutter dovrebbe fornire automaticamente dei file per test per il vostro modello, -di solito nella folder `tests/test_modeling_brand_new_bert.py`. Provate questo per verificare l'ok nei test piu comuni: - -```bash -pytest tests/test_modeling_brand_new_bert.py -``` - -Una volta sistemati i test comuni, bisogna assicurarsi che il vostro lavoro sia correttamente testato cosicchè: - -- a) La community puo capire in maniera semplice il vostro lavoro controllando tests specifici del modello *brand_new_bert*, -- b) Implementazioni future del vostro modello non rompano alcune feature importante del modello. - -Per prima cosa agguingete dei test d'integrazione. Questi sono essenziali perche fanno la stessa funzione degli scripts di -debug usati precedentemente. Un template per questi tests esiste gia nel Cookiecutter ed é sotto il nome di `BrandNewBertModelIntegrationTests`, -voi dovrete solo completarlo. Una volta che questi tests sono OK, provate: - -```bash -RUN_SLOW=1 pytest -sv tests/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests -``` - - - -Nel caso siate su Windows, sostituite `RUN_SLOW=1` con `SET RUN_SLOW=1` - - - -Di seguito, tutte le features che sono utili e necessarire per *brand_new_bert* devono essere testate in test separati, -contenuti in `BrandNewBertModelTester`/ `BrandNewBertModelTest`. 
spesso la gente si scorda questi test, ma ricordate che sono utili per: - - -- Aiuta gli utenti a capire il vostro codice meglio, richiamando l'attenzione su queste nuove features -- Developers e contributors futuri potranno velocemente testare nuove implementazioni del modello testanto questi casi speciali. - - -**9. Implementare il tokenizer** - -A questo punto avremo bisogno un tokenizer per *brand_new_bert*. Di solito il tokenizer é uguale ad altri modelli in 🤗 Transformers. - -É importante che troviate il file con il tokenizer originale e che lo carichiate in 🤗 Transformers. - -Per controllare che il tokenizer funzioni in modo corretto, create uno script nella repo originaria che riceva come input -una stringa e ritorni gli `input_ids`. Piu o meno questo potrebbe essere il codice: - -```python -input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." -model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") -input_ids = model.tokenize(input_str) -``` - -Potrebbe richiedere un po' di tempo, ma guardate ancora alla repo originaria per trovare la funzione corretta del tokenizer. -A volte capita di dover riscrivere il tokenizer nella repo originaria, di modo da avere come output gli `input_ids`. -A quel punto uno script analogo é necessario in 🤗 Transformers: - -```python -from transformers import BrandNewBertTokenizer - -input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." - -tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/") - -input_ids = tokenizer(input_str).input_ids -``` - -Una volta che `input_ids` sono uguali, bisogna aggiungere un test per il tokenizer. - -Il file test per tokenizer di *brand_new_brand* dovrebbe avere un paio di hard-coded test d'integrazione. - - -**10. Test end-to-end** - -Ora che avete il tokenizer, dovrete aggiungere dei test d'integrazione per l'intero workflow in `tests/test_modeling_brand_new_bert.py` in 🤗 Transformer. -Questi test devono mostrare che un significante campione text-to-text funzioni come ci si aspetta nell'implementazione di 🤗 Transformers. -*Per esempio* potreste usare dei source-to-target-translation, o un sommario di un articolo, o un domanda-risposta e cosi via. -Se nessuno dei checkpoints é stato ultra parametrizzato per task simili, allora i tests per il modello sono piu che sufficienti. -Nello step finale dovete assicurarvi che il modello sia totalmente funzionale, e consigliamo anche di provare a testare su GPU. -Puo succedere che ci si scordi un `.to(self.device)` ad esempio. Se non avete accesso a GPU, il team Hugging Face puo provvedere -a testare questo aspetto per voi. - -**11. Aggiungere una Docstring** - -Siete quasi alla fine! L'ultima cosa rimasta é avere una bella docstring e una pagina doc. Il Cookiecutter dovrebbe provvedere già -un template chiamato `docs/source/model_doc/brand_new_bert.rst`, che dovrete compilare. La prima cosa che un utente farà -per usare il vostro modello sarà dare una bella lettura al doc. Quindi proponete una documentazione chiara e concisa. É molto -utile per la community avere anche delle *Tips* per mostrare come il modello puo' essere usato. Non esitate a chiedere a Hugging Face -riguardo alle docstirng. - -Quindi, assicuratevi che la docstring sia stata aggiunta a `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`. -Assicuratevi che la docstring sia corretta e che includa tutti i necessari input e output. 
Abbiamo una guida dettagliata per -scrivere la documentazione e docstring. - - -**Rifattorizzare il codice** - -Perfetto! Ora che abbiamo tutto per *brand_new_bert* controllate che lo stile del codice sia ok: - -```bash -make style -``` - -E che il codice passi i quality check: - -```bash -make quality -``` - -A volte capita che manchino delle informazioninella docstring o alcuni nomi sbagliati, questo farà fallire i tests sopra. -Ripetiamo: chiedete pure a Hugging Face, saremo lieti di aiutarvi. - -Per ultimo, fare del refactoring del codice una volta che é stato creato. - -Avete finito con il codice, congratulazioni! 🎉 Siete fantasticiiiiiii! 😎 - -**12. Caricare il modello sul model hub** - -In questa ultima parte dovrete convertire e caricare il modello, con tutti i checkpoints, nel model hub e aggiungere una -model card per ogni checkpoint caricato. Leggete la nostra guida [Model sharing and uploading Page](model_sharing) per -avere familiarità con l'hub. Di solito in questa parte lavorate a fianco di Hugging face per decidere un nome che sia ok -per ogni checkpoint, per ottenere i permessi necessari per caricare il modello nell'organizzazione dell'autore di *brand_new_bert*. -Il metodo `push_to_hub`, presente in tutti i modelli `transformers`, é una maniera rapida e indolore per caricare il vostro checkpoint sull'hub: - -```python -brand_new_bert.push_to_hub( - repo_path_or_name="brand_new_bert", - # Uncomment the following line to push to an organization - # organization="", - commit_message="Add model", - use_temp_dir=True, -) -``` - -Vale la pena spendere un po' di tempo per creare una model card ad-hoc per ogni checkpoint. Le model cards dovrebbero -suggerire le caratteristiche specifiche del checkpoint, *per esempio* su che dataset il checkpoint é stato pretrained o fine-tuned. -O che su che genere di task il modello lavoro? E anche buona pratica includere del codice su come usare il modello correttamente. - - -**13. (Opzionale) Aggiungere un notebook** - -É molto utile aggiungere un notebook, che dimostri in dettaglio come *brand_new_bert* si utilizzi per fare inferenza e/o -fine-tuned su specifiche task. Non é una cosa obbligatoria da avere nella vostra PR, ma é molto utile per la community. - -**14. Sottomettere la PR** - -L'ultimissimo step! Ovvero il merge della PR nel main. Di solito il team Hugging face a questo punto vi avrà gia aiutato, -ma é ok prendere un po' di tempo per pulire la descirzione e commenti nel codice. - - -### Condividete il vostro lavoro!! - -É ora tempo di prendere un po' di credito dalla communità per il vostro lavoro! Caricare e implementare un nuovo modello -é un grandissimo contributo per Transformers e l'intera community NLP. Il codice e la conversione dei modelli pre-trained sara -sicuramente utilizzato da centinaia o migliaia di sviluppatori e ricercatori. Siate fieri e orgogliosi di condividere il vostro -traguardo con l'intera community :) - -** Avete create un altro modello che é super facile da usare per tutti quanti nella community! 🤯** diff --git a/docs/source/it/add_new_pipeline.md b/docs/source/it/add_new_pipeline.md new file mode 100644 index 000000000000..adc1c3651a2c --- /dev/null +++ b/docs/source/it/add_new_pipeline.md @@ -0,0 +1,250 @@ + + +# Come creare una pipeline personalizzata? + +In questa guida, scopriremo come creare una pipeline personalizzata e condividerla sull' [Hub](hf.co/models) o aggiungerla nella libreria +Transformers. 
+ +Innanzitutto, è necessario decidere gli input grezzi che la pipeline sarà in grado di accettare. Possono essere strings, raw bytes, +dictionaries o qualsiasi cosa sembri essere l'input desiderato più probabile. Cerca di mantenere questi input il più possibile in Python puro, +in quanto ciò facilita la compatibilità (anche con altri linguaggi tramite JSON). Questi saranno gli `inputs` della +pipeline (`preprocess`). + +Poi definire gli `outputs`. Stessa strategia degli `inputs`. Più è semplice e meglio è. Questi saranno gli output del metodo +`postprocess`. + +Si parte ereditando la classe base `Pipeline`, con i 4 metodi che bisogna implementare: `preprocess`, +`_forward`, `postprocess` e `_sanitize_parameters`. + + +```python +from transformers import Pipeline + + +class MyPipeline(Pipeline): + def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "maybe_arg" in kwargs: + preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] + return preprocess_kwargs, {}, {} + + def preprocess(self, inputs, maybe_arg=2): + model_input = Tensor(inputs["input_ids"]) + return {"model_input": model_input} + + def _forward(self, model_inputs): + # model_inputs == {"model_input": model_input} + outputs = self.model(**model_inputs) + # Maybe {"logits": Tensor(...)} + return outputs + + def postprocess(self, model_outputs): + best_class = model_outputs["logits"].softmax(-1) + return best_class +``` + +La struttura di questa suddivisione serve a supportare in modo relativamente trasparente CPU/GPU, consentendo allo stesso tempo l'esecuzione della +pre/postelaborazione sulla CPU su thread diversi. + +`preprocess` prenderà gli input originariamente definiti e li trasformerà in qualcosa che possa essere dato in pasto al modello. Potrebbe +contenere più informazioni e di solito è un `Dict`. + +`_forward` è il dettaglio dell'implementazione e non è destinato a essere chiamato direttamente. `forward` è il metodo preferito per assicurarsi che tutto funzioni correttamente perché contiene delle salvaguardie. Se qualcosa +è collegato a un modello reale, appartiene al metodo `_forward`; tutto il resto va nel preprocess/postprocess. + +`postprocess` prende l'output di `_forward` e lo trasforma nell'output finale che era stato deciso in precedenza. + +`_sanitize_parameters` esiste per consentire agli utenti di passare i parametri ogni volta che desiderano, sia a initialization time `pipeline(...., maybe_arg=4)` che al call time `pipe = pipeline(...); output = pipe(...., maybe_arg=4)`. + +`_sanitize_parameters` ritorna 3 dicts di kwargs che vengono passati direttamente a `preprocess`, +`_forward` e `postprocess`. Non riempire nulla se il chiamante non ha passato alcun parametro aggiuntivo. Questo +consente di mantenere gli argomenti predefiniti nella definizione della funzione, che è sempre più "naturale". + +Un esempio classico potrebbe essere l'argomento `top_k` nel post processing dei classification tasks. + +```python +>>> pipe = pipeline("my-new-task") +>>> pipe("This is a test") +[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05} +{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}] + +>>> pipe("This is a test", top_k=2) +[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}] +``` + +Per ottenere questo risultato, aggiorneremo il nostro metodo `postprocess` con un parametro di default pari a `5` e modificheremo +`_sanitize_parameters` per accettare questo nuovo parametro.
+ + +```python +def postprocess(self, model_outputs, top_k=5): + best_class = model_outputs["logits"].softmax(-1) + # Add logic to handle top_k + return best_class + + +def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "maybe_arg" in kwargs: + preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] + + postprocess_kwargs = {} + if "top_k" in kwargs: + postprocess_kwargs["top_k"] = kwargs["top_k"] + return preprocess_kwargs, {}, postprocess_kwargs +``` + +Cercare di mantenere gli input/output molto semplici e idealmente serializzabili in JSON, in quanto ciò rende l'uso della pipeline molto facile +senza richiedere agli utenti di comprendere nuovi tipi di oggetti. È anche relativamente comune supportare molti tipi di argomenti +per facilitarne l'uso (ad esempio file audio, possono essere nomi di file, URL o byte puri). + +## Aggiungilo alla lista dei tasks supportati + +Per registrar il tuo `new-task` alla lista dei tasks supportati, devi aggiungerlo al `PIPELINE_REGISTRY`: + +```python +from transformers.pipelines import PIPELINE_REGISTRY + +PIPELINE_REGISTRY.register_pipeline( + "new-task", + pipeline_class=MyPipeline, + pt_model=AutoModelForSequenceClassification, +) +``` + +Puoi specificare il modello di default che desideri, in questo caso dovrebbe essere accompagnato da una revisione specifica (che può essere il nome di un branch o l'hash di un commit, in questo caso abbiamo preso `"abcdef"`) e anche dal type: + +```python +PIPELINE_REGISTRY.register_pipeline( + "new-task", + pipeline_class=MyPipeline, + pt_model=AutoModelForSequenceClassification, + default={"pt": ("user/awesome_model", "abcdef")}, + type="text", # current support type: text, audio, image, multimodal +) +``` + +## Condividi la tua pipeline sull'Hub + +Per condividere la tua pipeline personalizzata sull'Hub, devi solo salvare il codice della tua sottoclasse `Pipeline` in un file +python. Per esempio, supponiamo di voler utilizzare una pipeline personalizzata per la classificazione delle coppie di frasi come la seguente: + +```py +import numpy as np + +from transformers import Pipeline + + +def softmax(outputs): + maxes = np.max(outputs, axis=-1, keepdims=True) + shifted_exp = np.exp(outputs - maxes) + return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) + + +class PairClassificationPipeline(Pipeline): + def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "second_text" in kwargs: + preprocess_kwargs["second_text"] = kwargs["second_text"] + return preprocess_kwargs, {}, {} + + def preprocess(self, text, second_text=None): + return self.tokenizer(text, text_pair=second_text, return_tensors=self.framework) + + def _forward(self, model_inputs): + return self.model(**model_inputs) + + def postprocess(self, model_outputs): + logits = model_outputs.logits[0].numpy() + probabilities = softmax(logits) + + best_class = np.argmax(probabilities) + label = self.model.config.id2label[best_class] + score = probabilities[best_class].item() + logits = logits.tolist() + return {"label": label, "score": score, "logits": logits} +``` + +L'implementazione è agnostica al framework, e lavorerà sia con modelli PyTorch che con TensorFlow. 
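Prima di registrarla o condividerla, può essere utile una prova rapida istanziando direttamente la classe con un modello e un tokenizer già caricati. Quello che segue è solo uno schizzo indicativo: riusa il checkpoint `sgugger/finetuned-bert-mrpc` mostrato più avanti in questa guida, e l'etichetta restituita dipende dall'`id2label` del checkpoint.

```py
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Checkpoint fine-tuned su MRPC, lo stesso usato più avanti in questa guida
model = AutoModelForSequenceClassification.from_pretrained("sgugger/finetuned-bert-mrpc")
tokenizer = AutoTokenizer.from_pretrained("sgugger/finetuned-bert-mrpc")

# Istanzia direttamente la pipeline personalizzata definita sopra
pair_classifier = PairClassificationPipeline(model=model, tokenizer=tokenizer)

# `second_text` viene instradato a `preprocess` tramite `_sanitize_parameters`
print(pair_classifier("I like you", second_text="I love you"))
# es. {'label': '...', 'score': 0.9, 'logits': [...]} (i valori esatti dipendono dal checkpoint)
```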
Se l'abbiamo salvato in un file chiamato `pair_classification.py`, può essere successivamente importato e registrato in questo modo: + +```py +from pair_classification import PairClassificationPipeline +from transformers.pipelines import PIPELINE_REGISTRY +from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification + +PIPELINE_REGISTRY.register_pipeline( + "pair-classification", + pipeline_class=PairClassificationPipeline, + pt_model=AutoModelForSequenceClassification, + tf_model=TFAutoModelForSequenceClassification, +) +``` + +Una volta fatto, possiamo usarla con un modello pretrained. L'istanza `sgugger/finetuned-bert-mrpc` è stata +fine-tuned sul dataset MRPC, che classifica le coppie di frasi come parafrasi o no. + +```py +from transformers import pipeline + +classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc") +``` + +Successivamente possiamo condividerla sull'Hub usando il metodo `save_pretrained` in un `Repository`: + +```py +from huggingface_hub import Repository + +repo = Repository("test-dynamic-pipeline", clone_from="{your_username}/test-dynamic-pipeline") +classifier.save_pretrained("test-dynamic-pipeline") +repo.push_to_hub() +``` + +Questo codice copierà il file dove è stato definito `PairClassificationPipeline` all'interno della cartella `"test-dynamic-pipeline"`, +insieme al salvataggio del modello e del tokenizer della pipeline, prima di pushare il tutto nel repository +`{your_username}/test-dynamic-pipeline`. Dopodiché chiunque potrà utilizzarla, purché fornisca l'opzione +`trust_remote_code=True`: + +```py +from transformers import pipeline + +classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True) +``` + +## Aggiungere la pipeline a Transformers + +Se vuoi contribuire con la tua pipeline a Transformers, dovrai aggiungere un modulo nel sottomodulo `pipelines` +con il codice della tua pipeline, quindi aggiungilo all'elenco dei tasks definiti in `pipelines/__init__.py`. + +Poi hai bisogno di aggiungere i test. Crea un nuovo file `tests/test_pipelines_MY_PIPELINE.py` con esempi ed altri test. + +La funzione `run_pipeline_test` sarà molto generica e verrà eseguita su piccoli modelli casuali per ogni possibile +architettura, come definito da `model_mapping` e `tf_model_mapping`. + +Questo è molto importante per testare la compatibilità futura, nel senso che se qualcuno aggiunge un nuovo modello di tipo +`XXXForQuestionAnswering` allora il test della pipeline tenterà di essere eseguito su di esso. Poiché i modelli sono casuali, +è impossibile controllare i valori effettivi; per questo esiste un helper `ANY`, che si limita a verificare che l'output della pipeline corrisponda al TYPE atteso. + +Hai anche *bisogno* di implementare 2 (idealmente 4) test. + +- `test_small_model_pt` : Definire 1 piccolo modello per questa pipeline (non importa se i risultati non hanno senso) + e testare i risultati della pipeline. I risultati dovrebbero essere gli stessi di `test_small_model_tf`. +- `test_small_model_tf` : Definire 1 piccolo modello per questa pipeline (non importa se i risultati non hanno senso) + e testare i risultati della pipeline. I risultati dovrebbero essere gli stessi di `test_small_model_pt`. +- `test_large_model_pt` (`optional`): Testare la pipeline su una pipeline reale in cui i risultati dovrebbero avere + senso. Questi test sono lenti e dovrebbero essere contrassegnati come tali.
In questo caso l'obiettivo è mostrare la pipeline e assicurarsi che non ci siano derive nelle versioni future +- `test_large_model_tf` (`optional`): Testare la pipeline su una pipeline reale in cui i risultati dovrebbero avere + senso. Questi test sono lenti e dovrebbero essere contrassegnati come tali. In questo caso l'obiettivo è mostrare la pipeline e assicurarsi + che non ci siano derive nelle versioni future \ No newline at end of file diff --git a/docs/source/it/add_new_pipeline.mdx b/docs/source/it/add_new_pipeline.mdx deleted file mode 100644 index cf9acd2902fc..000000000000 --- a/docs/source/it/add_new_pipeline.mdx +++ /dev/null @@ -1,246 +0,0 @@ - - -# Come creare una pipeline personalizzata? - -In questa guida, scopriremo come creare una pipeline personalizzata e condividerla sull' [Hub](hf.co/models) o aggiungerla nella libreria -Transformers. - -Innanzitutto, è necessario decidere gli input grezzi che la pipeline sarà in grado di accettare. Possono essere strings, raw bytes, -dictionaries o qualsiasi cosa sia l'input desiderato più probabile. Cerca di mantenere questi input il più possibile in Python -in quanto facilita la compatibilità (anche con altri linguaggi tramite JSON). Questi saranno gli `inputs` della -pipeline (`preprocess`). - -Poi definire gli `outputs`. Stessa strategia degli `inputs`. Più è seplice e meglio è. Questi saranno gli output del metodo -`postprocess`. - -Si parte ereditando la classe base `Pipeline`. con i 4 metodi che bisogna implementare `preprocess`, -`_forward`, `postprocess` e `_sanitize_parameters`. - - -```python -from transformers import Pipeline - - -class MyPipeline(Pipeline): - def _sanitize_parameters(self, **kwargs): - preprocess_kwargs = {} - if "maybe_arg" in kwargs: - preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] - return preprocess_kwargs, {}, {} - - def preprocess(self, inputs, maybe_arg=2): - model_input = Tensor(inputs["input_ids"]) - return {"model_input": model_input} - - def _forward(self, model_inputs): - # model_inputs == {"model_input": model_input} - outputs = self.model(**model_inputs) - # Maybe {"logits": Tensor(...)} - return outputs - - def postprocess(self, model_outputs): - best_class = model_outputs["logits"].softmax(-1) - return best_class -``` - -La struttura di questa suddivisione consiste nel supportare in modo relativamente continuo CPU/GPU, supportando allo stesso tempo l'esecuzione di -pre/postelaborazione sulla CPU su thread diversi. - -`preprocess` prenderà gli input originariamente definiti e li trasformerà in qualcosa di alimentabile dal modello. Potrebbe -contenere più informazioni e di solito è un `Dict`. - -`_forward` è il dettaglio dell'implementazione e non è destinato a essere chiamato direttamente. `forward` è il metodo preferito per assicurarsi che tutto funzioni correttamente perchè contiene delle slavaguardie. Se qualcosa è -è collegato a un modello reale, appartiene al metodo `_forward`, tutto il resto è nel preprocess/postprocess. - -`postprocess` prende l'otput di `_forward` e lo trasforma nell'output finale che era stato deciso in precedenza. - -`_sanitize_parameters` esiste per consentire agli utenti di passare i parametri ogni volta che desiderano sia a inizialization time `pipeline(...., maybe_arg=4)` che al call time `pipe = pipeline(...); output = pipe(...., maybe_arg=4)`. - -`_sanitize_parameters` ritorna 3 dicts di kwargs che vengono passati direttamente a `preprocess`, -`_forward` e `postprocess`. Non riempire nulla se il chiamante non ha chiamato con alcun parametro aggiuntivo. 
Questo -consente di mantenere gli argomenti predefiniti nella definizione della funzione, che è sempre più "naturale". - -Un esempio classico potrebbe essere l'argomento `top_k` nel post processing dei classification tasks. - -```python ->>> pipe = pipeline("my-new-task") ->>> pipe("This is a test") -[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05} -{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}] - ->>> pipe("This is a test", top_k=2) -[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}] -``` - -In order to achieve that, we'll update our `postprocess` method with a default parameter to `5`. and edit -`_sanitize_parameters` to allow this new parameter. - - -```python -def postprocess(self, model_outputs, top_k=5): - best_class = model_outputs["logits"].softmax(-1) - # Add logic to handle top_k - return best_class - - -def _sanitize_parameters(self, **kwargs): - preprocess_kwargs = {} - if "maybe_arg" in kwargs: - preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] - - postprocess_kwargs = {} - if "top_k" in kwargs: - postprocess_kwargs["top_k"] = kwargs["top_k"] - return preprocess_kwargs, {}, postprocess_kwargs -``` - -Cercare di mantenere gli input/output molto semplici e idealmente serializzabili in JSON, in quanto ciò rende l'uso della pipeline molto facile -senza richiedere agli utenti di comprendere nuovi tipi di oggetti. È anche relativamente comune supportare molti tipi di argomenti -per facilitarne l'uso (ad esempio file audio, possono essere nomi di file, URL o byte puri). - -## Aggiungilo alla lista dei tasks supportati - -Per registrar il tuo `new-task` alla lista dei tasks supportati, devi aggiungerlo al `PIPELINE_REGISTRY`: - -```python -from transformers.pipelines import PIPELINE_REGISTRY - -PIPELINE_REGISTRY.register_pipeline( - "new-task", - pipeline_class=MyPipeline, - pt_model=AutoModelForSequenceClassification, -) -``` - -Puoi specificare il modello di default che desideri, in questo caso dovrebbe essere accompagnato da una revisione specifica (che può essere il nome di un branch o l'hash di un commit, in questo caso abbiamo preso `"abcdef"`) e anche dal type: - -```python -PIPELINE_REGISTRY.register_pipeline( - "new-task", - pipeline_class=MyPipeline, - pt_model=AutoModelForSequenceClassification, - default={"pt": ("user/awesome_model", "abcdef")}, - type="text", # current support type: text, audio, image, multimodal -) -``` - -## Condividi la tua pipeline sull'Hub - -Per condividere la tua pipeline personalizzata sull'Hub, devi solo salvare il codice della tua sottoclasse `Pipeline` in un file -python. 
Per esempio, supponiamo di voler utilizzare una pipeline personalizzata per la classificazione delle coppie di frasi come la seguente: - -```py -import numpy as np - -from transformers import Pipeline - - -def softmax(outputs): - maxes = np.max(outputs, axis=-1, keepdims=True) - shifted_exp = np.exp(outputs - maxes) - return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) - - -class PairClassificationPipeline(Pipeline): - def _sanitize_parameters(self, **kwargs): - preprocess_kwargs = {} - if "second_text" in kwargs: - preprocess_kwargs["second_text"] = kwargs["second_text"] - return preprocess_kwargs, {}, {} - - def preprocess(self, text, second_text=None): - return self.tokenizer(text, text_pair=second_text, return_tensors=self.framework) - - def _forward(self, model_inputs): - return self.model(**model_inputs) - - def postprocess(self, model_outputs): - logits = model_outputs.logits[0].numpy() - probabilities = softmax(logits) - - best_class = np.argmax(probabilities) - label = self.model.config.id2label[best_class] - score = probabilities[best_class].item() - logits = logits.tolist() - return {"label": label, "score": score, "logits": logits} -``` - -L'implementazione è agnostica al framework, e lavorerà sia con modelli PyTorch che con TensorFlow. Se l'abbiamo salvato in un file chiamato `pair_classification.py`, può essere successivamente importato e registrato in questo modo: - -```py -from pair_classification import PairClassificationPipeline -from transformers.pipelines import PIPELINE_REGISTRY -from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification - -PIPELINE_REGISTRY.register_pipeline( - "pair-classification", - pipeline_class=PairClassificationPipeline, - pt_model=AutoModelForSequenceClassification, - tf_model=TFAutoModelForSequenceClassification, -) -``` - -Una volta fatto, possiamo usarla con un modello pretrained. L'istanza `sgugger/finetuned-bert-mrpc` è stata -fine-tuned sul dataset MRPC, che classifica le coppie di frasi come parafrasi o no. - -```py -from transformers import pipeline - -classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc") -``` - -Successivamente possiamo condividerlo sull'Hub usando il metodo `save_pretrained` in un `Repository`: - -```py -from huggingface_hub import Repository - -repo = Repository("test-dynamic-pipeline", clone_from="{your_username}/test-dynamic-pipeline") -classifier.save_pretrained("test-dynamic-pipeline") -repo.push_to_hub() -``` - -Questo codice copierà il file dove è stato definitp `PairClassificationPipeline` all'interno della cartella `"test-dynamic-pipeline"`, -insieme al salvataggio del modello e del tokenizer della pipeline, prima di pushare il tutto nel repository -`{your_username}/test-dynamic-pipeline`. Dopodiché chiunque potrà utilizzarlo, purché fornisca l'opzione -`trust_remote_code=True`: - -```py -from transformers import pipeline - -classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True) -``` - -## Aggiungere la pipeline a Transformers - -Se vuoi contribuire con la tua pipeline a Transformers, dovrai aggiungere un modulo nel sottomodulo `pipelines` -con il codice della tua pipeline, quindi aggiungilo all'elenco dei tasks definiti in `pipelines/__init__.py`. - -Poi hai bisogno di aggiungere i test. Crea un nuovo file `tests/test_pipelines_MY_PIPELINE.py` con esempi ed altri test. 
- -La funzione `run_pipeline_test` sarà molto generica e su piccoli modelli casuali su ogni possibile -architettura, come definito da `model_mapping` e `tf_model_mapping`. - -Questo è molto importante per testare la compatibilità futura, nel senso che se qualcuno aggiunge un nuovo modello di -`XXXForQuestionAnswering` allora il test della pipeline tenterà di essere eseguito su di esso. Poiché i modelli sono casuali, è -è impossibile controllare i valori effettivi, per questo esiste un aiuto `ANY` che tenterà solamente di far corrispondere l'output della pipeline TYPE. - -Hai anche *bisogno* di implementare 2 (idealmente 4) test. - -- `test_small_model_pt` : Definire 1 piccolo modello per questa pipeline (non importa se i risultati non hanno senso) - e testare i risultati della pipeline. I risultati dovrebbero essere gli stessi di `test_small_model_tf`. -- `test_small_model_tf` : Definire 1 piccolo modello per questa pipeline (non importa se i risultati non hanno senso) - e testare i risultati della pipeline. I risultati dovrebbero essere gli stessi di `test_small_model_pt`. -- `test_large_model_pt` (`optional`): Testare la pipeline su una pipeline reale in cui i risultati dovrebbero avere - senso. Questi test sono lenti e dovrebbero essere contrassegnati come tali. In questo caso l'obiettivo è mostrare la pipeline e assicurarsi che non ci siano derive nelle versioni future -- `test_large_model_tf` (`optional`): Testare la pipeline su una pipeline reale in cui i risultati dovrebbero avere - senso. Questi test sono lenti e dovrebbero essere contrassegnati come tali. In questo caso l'obiettivo è mostrare la pipeline e assicurarsi - che non ci siano derive nelle versioni future \ No newline at end of file diff --git a/docs/source/it/autoclass_tutorial.md b/docs/source/it/autoclass_tutorial.md new file mode 100644 index 000000000000..51621d098302 --- /dev/null +++ b/docs/source/it/autoclass_tutorial.md @@ -0,0 +1,123 @@ + + +# Carica istanze pre-allenate con AutoClass + +Con così tante architetture Transformer differenti, può essere sfidante crearne una per il tuo checkpoint. Come parte della filosofia centrale di 🤗 Transformers per rendere la libreria facile, semplice e flessibile da utilizzare, una `AutoClass` inferisce e carica automaticamente l'architettura corretta da un dato checkpoint. Il metodo `from_pretrained` ti permette di caricare velocemente un modello pre-allenato per qualsiasi architettura, così non devi utilizzare tempo e risorse per allenare un modello da zero. Produrre questo codice agnostico ai checkpoint significa che se il tuo codice funziona per un checkpoint, funzionerà anche per un altro checkpoint, purché sia stato allenato per un compito simile, anche se l'architettura è differente. + + + +Ricorda, con architettura ci si riferisce allo scheletro del modello e con checkpoint ai pesi di una determinata architettura. Per esempio, [BERT](https://huggingface.co/bert-base-uncased) è un'architettura, mentre `bert-base-uncased` è un checkpoint. Modello è un termine generale che può significare sia architettura che checkpoint. + + + +In questo tutorial, imparerai a: + +* Caricare un tokenizer pre-allenato. +* Caricare un estrattore di caratteristiche (feature extractor, in inglese) pre-allenato. +* Caricare un processore pre-allenato. +* Caricare un modello pre-allenato. + +## AutoTokenizer + +Quasi tutti i compiti di NLP iniziano con un tokenizer. Un tokenizer converte il tuo input in un formato che possa essere elaborato dal modello. 
+ +Carica un tokenizer con [`AutoTokenizer.from_pretrained`]: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base") +``` + +Poi tokenizza il tuo input come mostrato in seguito: + +```py +>>> sequenza = "In un buco nel terreno viveva uno Hobbit." +>>> print(tokenizer(sequenza)) +{'input_ids': [0, 360, 51, 373, 587, 1718, 54644, 22597, 330, 3269, 2291, 22155, 18, 5, 2], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +## AutoFeatureExtractor + +Per compiti inerenti a audio e video, un feature extractor processa il segnale audio o l'immagine nel formato di input corretto. + +Carica un feature extractor con [`AutoFeatureExtractor.from_pretrained`]: + +```py +>>> from transformers import AutoFeatureExtractor + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained( +... "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" +... ) +``` + +## AutoProcessor + +Compiti multimodali richiedono un processore che combini i due tipi di strumenti di elaborazione. Per esempio, il modello [LayoutLMV2](model_doc/layoutlmv2) richiede un feature extractor per gestire le immagine e un tokenizer per gestire il testo; un processore li combina entrambi. + +Carica un processore con [`AutoProcessor.from_pretrained`]: + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased") +``` + +## AutoModel + + + +Infine, le classi `AutoModelFor` ti permettono di caricare un modello pre-allenato per un determinato compito (guarda [qui](model_doc/auto) per una lista completa di compiti presenti). Per esempio, carica un modello per la classificazione di sequenze con [`AutoModelForSequenceClassification.from_pretrained`]: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Semplicemente utilizza lo stesso checkpoint per caricare un'architettura per un task differente: + +```py +>>> from transformers import AutoModelForTokenClassification + +>>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") +``` + +Generalmente, raccomandiamo di utilizzare la classe `AutoTokenizer` e la classe `AutoModelFor` per caricare istanze pre-allenate dei modelli. Questo ti assicurerà di aver caricato la corretta architettura ogni volta. Nel prossimo [tutorial](preprocessing), imparerai come utilizzare il tokenizer, il feature extractor e il processore per elaborare un dataset per il fine-tuning. + + + +Infine, le classi `TFAutoModelFor` ti permettono di caricare un modello pre-allenato per un determinato compito (guarda [qui](model_doc/auto) per una lista completa di compiti presenti). Per esempio, carica un modello per la classificazione di sequenze con [`TFAutoModelForSequenceClassification.from_pretrained`]: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Semplicemente utilizza lo stesso checkpoint per caricare un'architettura per un task differente: + +```py +>>> from transformers import TFAutoModelForTokenClassification + +>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") +``` + +Generalmente, raccomandiamo di utilizzare la classe `AutoTokenizer` e la classe `TFAutoModelFor` per caricare istanze pre-allenate dei modelli. 
Questo ti assicurerà di aver caricato la corretta architettura ogni volta. Nel prossimo [tutorial](preprocessing), imparerai come utilizzare il tokenizer, il feature extractor e il processore per elaborare un dataset per il fine-tuning. + + diff --git a/docs/source/it/autoclass_tutorial.mdx b/docs/source/it/autoclass_tutorial.mdx deleted file mode 100644 index 88dd6cad6c42..000000000000 --- a/docs/source/it/autoclass_tutorial.mdx +++ /dev/null @@ -1,119 +0,0 @@ - - -# Carica istanze pre-allenate con AutoClass - -Con così tante architetture Transformer differenti, può essere sfidante crearne una per il tuo checkpoint. Come parte della filosofia centrale di 🤗 Transformers per rendere la libreria facile, semplice e flessibile da utilizzare, una `AutoClass` inferisce e carica automaticamente l'architettura corretta da un dato checkpoint. Il metodo `from_pretrained` ti permette di caricare velocemente un modello pre-allenato per qualsiasi architettura, così non devi utilizzare tempo e risorse per allenare un modello da zero. Produrre questo codice agnostico ai checkpoint significa che se il tuo codice funziona per un checkpoint, funzionerà anche per un altro checkpoint, purché sia stato allenato per un compito simile, anche se l'architettura è differente. - - - -Ricorda, con architettura ci si riferisce allo scheletro del modello e con checkpoint ai pesi di una determinata architettura. Per esempio, [BERT](https://huggingface.co/bert-base-uncased) è un'architettura, mentre `bert-base-uncased` è un checkpoint. Modello è un termine generale che può significare sia architettura che checkpoint. - - - -In questo tutorial, imparerai a: - -* Caricare un tokenizer pre-allenato. -* Caricare un estrattore di caratteristiche (feature extractor, in inglese) pre-allenato. -* Caricare un processore pre-allenato. -* Caricare un modello pre-allenato. - -## AutoTokenizer - -Quasi tutti i compiti di NLP iniziano con un tokenizer. Un tokenizer converte il tuo input in un formato che possa essere elaborato dal modello. - -Carica un tokenizer con [`AutoTokenizer.from_pretrained`]: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base") -``` - -Poi tokenizza il tuo input come mostrato in seguito: - -```py ->>> sequenza = "In un buco nel terreno viveva uno Hobbit." ->>> print(tokenizer(sequenza)) -{'input_ids': [0, 360, 51, 373, 587, 1718, 54644, 22597, 330, 3269, 2291, 22155, 18, 5, 2], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -## AutoFeatureExtractor - -Per compiti inerenti a audio e video, un feature extractor processa il segnale audio o l'immagine nel formato di input corretto. - -Carica un feature extractor con [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained( -... "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" -... ) -``` - -## AutoProcessor - -Compiti multimodali richiedono un processore che combini i due tipi di strumenti di elaborazione. Per esempio, il modello [LayoutLMV2](model_doc/layoutlmv2) richiede un feature extractor per gestire le immagine e un tokenizer per gestire il testo; un processore li combina entrambi. 
- -Carica un processore con [`AutoProcessor.from_pretrained`]: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased") -``` - -## AutoModel - - - -Infine, le classi `AutoModelFor` ti permettono di caricare un modello pre-allenato per un determinato compito (guarda [qui](model_doc/auto) per una lista completa di compiti presenti). Per esempio, carica un modello per la classificazione di sequenze con [`AutoModelForSequenceClassification.from_pretrained`]: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Semplicemente utilizza lo stesso checkpoint per caricare un'architettura per un task differente: - -```py ->>> from transformers import AutoModelForTokenClassification - ->>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") -``` - -Generalmente, raccomandiamo di utilizzare la classe `AutoTokenizer` e la classe `AutoModelFor` per caricare istanze pre-allenate dei modelli. Questo ti assicurerà di aver caricato la corretta architettura ogni volta. Nel prossimo [tutorial](preprocessing), imparerai come utilizzare il tokenizer, il feature extractor e il processore per elaborare un dataset per il fine-tuning. - - - -Infine, le classi `TFAutoModelFor` ti permettono di caricare un modello pre-allenato per un determinato compito (guarda [qui](model_doc/auto) per una lista completa di compiti presenti). Per esempio, carica un modello per la classificazione di sequenze con [`TFAutoModelForSequenceClassification.from_pretrained`]: - -```py ->>> from transformers import TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Semplicemente utilizza lo stesso checkpoint per caricare un'architettura per un task differente: - -```py ->>> from transformers import TFAutoModelForTokenClassification - ->>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") -``` - -Generalmente, raccomandiamo di utilizzare la classe `AutoTokenizer` e la classe `TFAutoModelFor` per caricare istanze pre-allenate dei modelli. Questo ti assicurerà di aver caricato la corretta architettura ogni volta. Nel prossimo [tutorial](preprocessing), imparerai come utilizzare il tokenizer, il feature extractor e il processore per elaborare un dataset per il fine-tuning. - - diff --git a/docs/source/it/big_models.md b/docs/source/it/big_models.md new file mode 100644 index 000000000000..cd0fd9017d9d --- /dev/null +++ b/docs/source/it/big_models.md @@ -0,0 +1,123 @@ + + +# Istanziare un big model + +Quando vuoi utilizzare un modello preaddestrato (pretrained) molto grande, una sfida è minimizzare l'uso della RAM. Il workflow classico +in PyTorch è: + +1. Crea il tuo modello con pesi casuali (random weights). +2. Carica i tuoi pesi preaddestrati. +3. Inserisci i pesi preaddestrati nel tuo modello casuale. + +I passi 1 e 2 una versione completa del modello in memoria, in molti casi non è un problema, ma se il modello inizia a pesare diversi GigaBytes, queste due copie possono sturare la nostra RAM. Ancora peggio, se stai usando `torch.distributed` per seguire l'addestramento (training) in distribuito, ogni processo caricherà il modello preaddestrato e memorizzerà queste due copie nella RAM. 
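Per fissare le idee, ecco uno schizzo minimale di questo workflow classico in puro PyTorch (non è il codice interno di 🤗 Transformers, solo un esempio indicativo): si assume di avere già scaricato in locale un file di pesi `pytorch_model.bin` compatibile con l'architettura scelta. Tra il passo 2 e il passo 3 restano in memoria, allo stesso tempo, sia i pesi casuali del modello sia lo `state_dict` appena caricato.

```py
import torch
from transformers import AutoConfig, AutoModel

# 1. Crea il modello con pesi casuali (prima copia completa dei pesi in RAM)
config = AutoConfig.from_pretrained("bert-base-cased")
model = AutoModel.from_config(config)

# 2. Carica i pesi preaddestrati da disco (seconda copia completa in RAM)
#    Si assume che "pytorch_model.bin" sia già stato scaricato in locale.
state_dict = torch.load("pytorch_model.bin", map_location="cpu")

# 3. Inserisci i pesi preaddestrati nel modello inizializzato casualmente
model.load_state_dict(state_dict)
```

Con un modello da svariati GigaBytes è proprio questa doppia copia, moltiplicata per il numero di processi quando si usa `torch.distributed`, a saturare la RAM.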
+ + + +Nota che il modello creato casualmente è inizializzato con tensori "vuoti", che occupano spazio in memoria senza riempirlo (quindi i valori casuali sono quelli che si trovavano in questa porzione di memoria in un determinato momento). L'inizializzazione casuale che segue la distribuzione appropriata per il tipo di modello/parametri istanziato (come ad esempio la distribuzione normale) è eseguita solo dopo il passaggio 3, sui pesi non inizializzati, in modo da essere il più veloce possibile! + + + +In questa guida, esploreremo le soluzioni che Transformers offre per affrontare questo problema. Tieni presente che questa è un'area in attivo sviluppo, quindi le API spiegate qui possono variare velocemente in futuro. + +## Checkpoints condivisi + +Dalla versione 4.18.0, i checkpoints dei modelli che occupano più di 10GB di spazio vengono automaticamente frammentati in più parti. Quindi, invece di avere un unico checkpoint quando si utilizza `model.save_pretrained(save_dir)`, si ottengono diversi checkpoint parziali (ognuno con dimensione < 10GB) e un indice che mappa i nomi dei parametri ai file in cui sono memorizzati. + +Puoi controllare la dimensione massima di ogni frammento con il parametro `max_shard_size`; nel prossimo esempio useremo un modello di dimensioni normali con frammenti di piccole dimensioni: prendiamo un classico modello BERT. + +```py +from transformers import AutoModel + +model = AutoModel.from_pretrained("bert-base-cased") +``` + +Se lo salvi usando [`~PreTrainedModel.save_pretrained`], avrai una nuova cartella con due file: il config del modello e i suoi pesi: + +```py +>>> import os +>>> import tempfile + +>>> with tempfile.TemporaryDirectory() as tmp_dir: +... model.save_pretrained(tmp_dir) +... print(sorted(os.listdir(tmp_dir))) +['config.json', 'pytorch_model.bin'] +``` + +Adesso usiamo una dimensione massima di frammentazione di 200MB: + +```py +>>> with tempfile.TemporaryDirectory() as tmp_dir: +... model.save_pretrained(tmp_dir, max_shard_size="200MB") +... print(sorted(os.listdir(tmp_dir))) +['config.json', 'pytorch_model-00001-of-00003.bin', 'pytorch_model-00002-of-00003.bin', 'pytorch_model-00003-of-00003.bin', 'pytorch_model.bin.index.json'] +``` + +In aggiunta alla configurazione del modello, vediamo tre differenti file dei pesi e un file `index.json` che è il nostro indice. Un checkpoint può essere ricaricato totalmente usando il metodo [`~PreTrainedModel.from_pretrained`]: + +```py +>>> with tempfile.TemporaryDirectory() as tmp_dir: +... model.save_pretrained(tmp_dir, max_shard_size="200MB") +... new_model = AutoModel.from_pretrained(tmp_dir) +``` + +Il vantaggio principale di questo approccio per i modelli grandi è che, durante il passo 2 del workflow illustrato in precedenza, ogni frammento del checkpoint viene caricato dopo il precedente, limitando l'utilizzo della RAM alla dimensione del modello più la dimensione del frammento più grande. + +Dietro le quinte, il file indice è utilizzato per determinare quali chiavi sono nel checkpoint e dove i corrispondenti pesi sono memorizzati. Possiamo caricare l'indice come un qualsiasi json e ottenere un dizionario: + +```py +>>> import json + +>>> with tempfile.TemporaryDirectory() as tmp_dir: +... model.save_pretrained(tmp_dir, max_shard_size="200MB") +... with open(os.path.join(tmp_dir, "pytorch_model.bin.index.json"), "r") as f: +...
index = json.load(f) + +>>> print(index.keys()) +dict_keys(['metadata', 'weight_map']) +``` + +I metadati consistono solo nella dimensione totale del modello per ora. Abbiamo in programma di aggiungere altre informazioni in futuro: + +```py +>>> index["metadata"] +{'total_size': 433245184} +``` + +La mappa dei pesi è la parte principale di questo indice, che mappa ogni nome dei parametri (si trova solitamente nei modelli PyTorch come `state_dict`) al file in cui è memorizzato: + +```py +>>> index["weight_map"] +{'embeddings.LayerNorm.bias': 'pytorch_model-00001-of-00003.bin', + 'embeddings.LayerNorm.weight': 'pytorch_model-00001-of-00003.bin', + ... +``` + +Se vuoi caricare direttamente un checkpoint frammentato in un modello senza usare [`~PreTrainedModel.from_pretrained`] (come si farebbe con `model.load_state_dict()` per un checkpoint completo) devi usare [`~modeling_utils.load_sharded_checkpoint`]: + +```py +>>> from transformers.modeling_utils import load_sharded_checkpoint + +>>> with tempfile.TemporaryDirectory() as tmp_dir: +... model.save_pretrained(tmp_dir, max_shard_size="200MB") +... load_sharded_checkpoint(model, tmp_dir) +``` + +## Caricamento low memory + +Frammentare i checkpoint l'utilizzo di memoria al passo 2 del workflow citato in precedenza, ma per utilizzare questo modello in un ambiente con poca memoria, consigliamo di utilizzare i nostri strumenti basati sulla libreria Accelerate. + +Per ulteriori informazioni, leggere la seguente guida: [Large model loading using Accelerate](./main_classes/model#large-model-loading) \ No newline at end of file diff --git a/docs/source/it/community.md b/docs/source/it/community.md new file mode 100644 index 000000000000..2f3c0c8a82b4 --- /dev/null +++ b/docs/source/it/community.md @@ -0,0 +1,68 @@ + + +# Comunità + +Questa pagina raggruppa le risorse sviluppate dalla comunità riguardo 🤗 Transformers. + +## Risorse della comunità: + +| Risorsa | Descrizione | Autore | +|:----------|:-------------|------:| +| [Glossario delle Flashcards di Transformers](https://www.darigovresearch.com/huggingface-transformers-glossary-flashcards) | Un insieme di flashcards basate sul [glossario della documentazione di Transformers](glossary), creato in un formato tale da permettere un facile apprendimento e revisione usando [Anki](https://apps.ankiweb.net/), un'applicazione open-source e multi-piattaforma, specificatamente progettata per ricordare informazioni nel lungo termine. Guarda questo [video introduttivo su come usare le flashcards](https://www.youtube.com/watch?v=Dji_h7PILrw). | [Darigov Research](https://www.darigovresearch.com/) | + +## Notebook della comunità: + +| Notebook | Descrizione | Autore | | +|:----------|:-------------|:-------------|------:| +| [Fine-tuning di un Transformer pre-addestrato, al fine di generare testi di canzoni](https://github.com/AlekseyKorshuk/huggingartists) | Come generare testi di canzoni nello stile del vostro artista preferito attraverso il fine-tuning di un modello GPT-2. | [Aleksey Korshuk](https://github.com/AlekseyKorshuk) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb) | +| [Addestramento di T5 in Tensorflow 2 ](https://github.com/snapthat/TF-T5-text-to-text) | Come addestrare T5 per qualsiasi attività usando Tensorflow 2. Questo notebook mostra come risolvere l'attività di "Question Answering" usando Tensorflow 2 e SQUAD. 
| [Muhammad Harris](https://github.com/HarrisDePerceptron) |[![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb) | +| [Addestramento di T5 con TPU](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb) | Come addestrare T5 su SQUAD con Transformers e NLP. | [Suraj Patil](https://github.com/patil-suraj) |[![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) | +| [Fine-tuning di T5 per la classificazione e scelta multipla](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | Come effettuare il fine-tuning di T5 per le attività di classificazione a scelta multipla - usando un formato testo-a-testo - con PyTorch Lightning. | [Suraj Patil](https://github.com/patil-suraj) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | +| [Fine-tuning di DialoGPT su nuovi dataset e lingue](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | Come effettuare il fine-tuning di un modello DialoGPT su un nuovo dataset per chatbots conversazionali open-dialog. | [Nathan Cooper](https://github.com/ncoop57) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | +| [Modellamento di una lunga sequenza con Reformer](https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | Come addestrare su sequenze di lunghezza fino a 500 mila token con Reformer. | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | +| [Fine-tuning di BART per riassumere testi](https://github.com/ohmeow/ohmeow_website/blob/master/_notebooks/2020-05-23-text-generation-with-blurr.ipynb) | Come effettuare il fine-tuning di BART per riassumere testi con fastai usando blurr. | [Wayde Gilliam](https://ohmeow.com/) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/_notebooks/2020-05-23-text-generation-with-blurr.ipynb) | +| [Fine-tuning di un Transformer pre-addestrato su tweet](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | Come generare tweet nello stile del tuo account Twitter preferito attraverso il fine-tuning di un modello GPT-2. | [Boris Dayma](https://github.com/borisdayma) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | +| [Ottimizzazione di modelli 🤗 Hugging Face con Weights & Biases](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | Un tutorial completo che mostra l'integrazione di W&B con Hugging Face. 
| [Boris Dayma](https://github.com/borisdayma) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | +| [Longformer pre-addestrato](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | Come costruire una versione "long" degli esistenti modelli pre-addestrati. | [Iz Beltagy](https://beltagy.net) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | +| [Fine-tuning di Longformer per QA](https://github.com/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | Come effettuare il fine-tuning di un modello longformer per un task di QA.| [Suraj Patil](https://github.com/patil-suraj) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | +| [Valutazione di modelli con 🤗NLP](https://github.com/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb) | Come valutare longformer su TriviaQA con `NLP`. | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1m7eTGlPmLRgoPkkA7rkhQdZ9ydpmsdLE?usp=sharing) | +| [Fine-tuning di T5 per Sentiment Span Extraction](https://github.com/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | Come effettuare il fine-tuning di T5 per la sentiment span extraction - usando un formato testo-a-testo - con PyTorch Lightning. | [Lorenzo Ampil](https://github.com/enzoampil) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | +| [Fine-tuning di DistilBert per la classificazione multi-classe](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb) | Come effettuare il fine-tuning di DistilBert per la classificazione multi-classe con PyTorch. | [Abhishek Kumar Mishra](https://github.com/abhimishra91) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb)| +|[Fine-tuning di BERT per la classificazione multi-etichetta](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)|Come effettuare il fine-tuning di BERT per la classificazione multi-etichetta con PyTorch. |[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)| +|[Accelerazione del fine-tuning con il Dynamic Padding / Bucketing](https://github.com/ELS-RD/transformers-notebook/blob/master/Divide_Hugging_Face_Transformers_training_time_by_2_or_more.ipynb)| Come velocizzare il fine-tuning di un fattore 2X usando il dynamic padding / bucketing. 
|[Michael Benesty](https://github.com/pommedeterresautee) |[![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CBfRU1zbfu7-ijiOqAAQUA-RJaxfcJoO?usp=sharing)| +|[Pre-addestramento di Reformer per Masked Language Modeling](https://github.com/patrickvonplaten/notebooks/blob/master/Reformer_For_Masked_LM.ipynb)| Come addestrare un modello Reformer usando livelli di self-attention bi-direzionali.| [Patrick von Platen](https://github.com/patrickvonplaten) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tzzh0i8PgDQGV3SMFUGxM7_gGae3K-uW?usp=sharing)| +|[Espansione e fine-tuning di Sci-BERT](https://github.com/lordtt13/word-embeddings/blob/master/COVID-19%20Research%20Data/COVID-SciBERT.ipynb)| Come incrementare il vocabolario di un modello SciBERT - pre-addestrato da AllenAI sul dataset CORD - e crearne una pipeline. | [Tanmay Thakur](https://github.com/lordtt13) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rqAR40goxbAfez1xvF3hBJphSCsvXmh8)| +|[Fine-tuning di BlenderBotSmall per riassumere testi usando Trainer API](https://github.com/lordtt13/transformers-experiments/blob/master/Custom%20Tasks/fine-tune-blenderbot_small-for-summarization.ipynb)| Come effettuare il fine-tuning di BlenderBotSmall per riassumere testi su un dataset personalizzato, usando Trainer API. | [Tanmay Thakur](https://github.com/lordtt13) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/19Wmupuls7mykSGyRN_Qo6lPQhgp56ymq?usp=sharing)| +|[Fine-tuning di Electra e interpretazione con Integrated Gradients](https://github.com/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb) | Come effettuare il fine-tuning di Electra per l'analisi dei sentimenti e intepretare le predizioni con Captum Integrated Gradients. | [Eliza Szczechla](https://elsanns.github.io) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb)| +|[Fine-tuning di un modello GPT-2 non inglese con la classe Trainer](https://github.com/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb) | Come effettuare il fine-tuning di un modello GPT-2 non inglese con la classe Trainer. | [Philipp Schmid](https://www.philschmid.de) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb)| +|[Fine-tuning di un modello DistilBERT per la classficazione multi-etichetta](https://github.com/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb) | Come effettuare il fine-tuning di un modello DistilBERT per l'attività di classificazione multi-etichetta. 
| [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)| +|[Fine-tuning di ALBERT per la classifcazione di coppie di frasi](https://github.com/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb) | Come effettuare il fine-tuning di un modello ALBERT - o un altro modello BERT-based - per l'attività di classificazione di coppie di frasi. | [Nadir El Manouzi](https://github.com/NadirEM) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb)| +|[Fine-tuning di Roberta per l'analisi di sentimenti](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | Come effettuare il fine-tuning di un modello Roberta per l'analisi di sentimenti. | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)| +|[Valutazione di modelli che generano domande](https://github.com/flexudy-pipe/qugeev) | Quanto sono accurante le risposte alle domande generate dal tuo modello transformer seq2seq? | [Pascal Zoleko](https://github.com/zolekode) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)| +|[Classificazione di testo con DistilBERT e Tensorflow](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | Come effettuare il fine-tuning di DistilBERT per la classificazione di testo in TensorFlow. | [Peter Bayerle](https://github.com/peterbayerle) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)| +|[Utilizzo di BERT per riassumere testi con un modello Encoder-Decoder su CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | Come avviare "a caldo" un *EncoderDecoderModel* attraverso l'utilizzo di un checkpoint *bert-base-uncased* per riassumere testi su CNN/Dailymail. | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)| +|[Utilizzo di RoBERTa per riassumere testi con un modello Encoder-Decoder su BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | Come avviare "a caldo" un *EncoderDecoderModel* (condiviso) attraverso l'utilizzo di un checkpoint *roberta-base* per riassumere testi su BBC/XSum. 
| [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)| +|[Fine-tuning di TAPAS su Sequential Question Answering (SQA)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | Come effettuare il fine-tuning di un modello *TapasForQuestionAnswering* attraverso l'utilizzo di un checkpoint *tapas-base* sul dataset Sequential Question Answering (SQA). | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)| +|[Valutazione di TAPAS su Table Fact Checking (TabFact)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | Come valutare un modello *TapasForSequenceClassification* - fine-tuned con un checkpoint *tapas-base-finetuned-tabfact* - usando una combinazione delle librerie 🤗 datasets e 🤗 transformers. | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)| +|[Fine-tuning di mBART per la traduzione](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | Come effettuare il fine-tuning di mBART usando Seq2SeqTrainer per la traduzione da hindi a inglese.| [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)| +|[Fine-tuning di LayoutLM su FUNSD (un dataset per la comprensione della forma)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb) | Come effettuare il fine-tuning di un modello *LayoutLMForTokenClassification* sul dataset FUNSD per l'estrazione di informazioni da documenti scannerizzati.| [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb)| +|[Fine-tuning di DistilGPT2 e generazione di testo](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb) | Come effettuare il fine-tuning di DistilGPT2 e generare testo. | [Aakash Tripathi](https://github.com/tripathiaakash) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb)| +|[Fine-tuning di LED fino a 8 mila token](https://github.com/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb) | Come effettuare il fine-tuning di LED su PubMed per riassumere "lunghi" testi. 
| [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb)| +|[Valutazione di LED su Arxiv](https://github.com/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb) | Come valutare efficacemente LED sull'attività di riassumere "lunghi" testi. | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb)| +|[Fine-tuning di LayoutLM su RVL-CDIP, un dataset per la classificazione di documenti (immagini)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb) | Come effettuare il fine-tuning di un modello *LayoutLMForSequenceClassification* sul dataset RVL-CDIP per la classificazione di documenti scannerizzati. | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb)| +|[Decodifica Wav2Vec2 CTC con variazioni di GPT2](https://github.com/voidful/huggingface_notebook/blob/main/xlsr_gpt.ipynb) | Come decodificare sequenze CTC, variate da modelli di linguaggio. | [Eric Lam](https://github.com/voidful) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1e_z5jQHYbO2YKEaUgzb1ww1WwiAyydAj?usp=sharing) +|[Fine-tuning di BART per riassumere testi in due lingue con la classe Trainer](https://github.com/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb) | Come effettuare il fine-tuning di BART per riassumere testi in due lingue usando la classe Trainer. | [Eliza Szczechla](https://github.com/elsanns) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)| +|[Valutazione di Big Bird su Trivia QA](https://github.com/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb) | Come valutare BigBird su question answering di "lunghi" documenti attraverso Trivia QA. | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb)| +| [Creazione di sottotitoli per video usando Wav2Vec2](https://github.com/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | Come creare sottotitoli per qualsiasi video di YouTube trascrivendo l'audio con Wav2Vec. 
| [Niklas Muennighoff](https://github.com/Muennighoff) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | +| [Fine-tuning di Vision Transformer su CIFAR-10 usando PyTorch Lightning](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | Come effettuare il fine-tuning di Vision Transformer (ViT) su CIFAR-10 usando HuggingFace Transformers, Datasets e PyTorch Lightning.| [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | +| [Fine-tuning di Vision Transformer su CIFAR-10 usando 🤗 Trainer](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | Come effettuare il fine-tuning di Vision Transformer (ViT) su CIFAR-10 usando HuggingFace Transformers, Datasets e 🤗 Trainer. | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | +| [Valutazione di LUKE su Open Entity, un dataset di entity typing](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | Come valutare un modello *LukeForEntityClassification* sul dataset Open Entity. | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | +| [Valutazione di LUKE su TACRED, un dataset per l'estrazione di relazioni](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | Come valutare un modello *LukeForEntityPairClassification* sul dataset TACRED. | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | +| [Valutazione di LUKE su CoNLL-2003, un importante benchmark NER](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | Come valutare un modello *LukeForEntitySpanClassification* sul dataset CoNLL-2003. | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | +| [Valutazione di BigBird-Pegasus su dataset PubMed](https://github.com/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | Come valutare un modello *BigBirdPegasusForConditionalGeneration* su dataset PubMed. 
| [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | +| [Classificazione di emozioni dal discorso con Wav2Vec2](https://github.com/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | Come utilizzare un modello pre-addestrato Wav2Vec2 per la classificazione di emozioni sul dataset MEGA. | [Mehrdad Farahani](https://github.com/m3hrdadfi) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | +| [Rilevamento oggetti in un'immagine con DETR](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | Come usare un modello addestrato *DetrForObjectDetection* per rilevare oggetti in un'immagine e visualizzare l'attention. | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | +| [Fine-tuning di DETR su un dataset personalizzato per rilevare oggetti](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | Come effettuare fine-tuning di un modello *DetrForObjectDetection* su un dataset personalizzato per rilevare oggetti. | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | +| [Fine-tuning di T5 per Named Entity Recognition](https://github.com/ToluClassics/Notebooks/blob/main/T5_Ner_Finetuning.ipynb) | Come effettuare fine-tuning di *T5* per un'attività di Named Entity Recognition. | [Ogundepo Odunayo](https://github.com/ToluClassics) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing) | diff --git a/docs/source/it/converting_tensorflow_models.md b/docs/source/it/converting_tensorflow_models.md new file mode 100644 index 000000000000..04398636359c --- /dev/null +++ b/docs/source/it/converting_tensorflow_models.md @@ -0,0 +1,159 @@ + + +# Convertire checkpoint di Tensorflow + +È disponibile un'interfaccia a linea di comando per convertire gli originali checkpoint di Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM +in modelli che possono essere caricati utilizzando i metodi `from_pretrained` della libreria. + + + +A partire dalla versione 2.3.0 lo script di conversione è parte di transformers CLI (**transformers-cli**), disponibile in ogni installazione +di transformers >=2.3.0. + +La seguente documentazione riflette il formato dei comandi di **transformers-cli convert**.
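+
+A puro titolo indicativo (questo schema non sostituisce gli esempi per modello riportati sotto), la forma generale del comando è la seguente; i percorsi sono segnaposto da adattare ai tuoi file e, per alcune architetture, `--config` e `--finetuning_task_name` sono opzionali:
+
+```bash
+# Schema generale con percorsi segnaposto: vedi le sezioni seguenti per i valori esatti di ogni architettura
+transformers-cli convert --model_type <tipo_modello> \
+  --tf_checkpoint /percorso/al/checkpoint/tensorflow \
+  --config /percorso/alla/config.json \
+  --pytorch_dump_output /percorso/di/output/pytorch_model.bin
+```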
+ + + +## BERT + +Puoi convertire qualunque checkpoint Tensorflow di BERT (in particolare +[i modelli pre-allenati rilasciati da Google](https://github.com/google-research/bert#pre-trained-models)) +in un file di salvataggio Pytorch utilizzando lo script +[convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py). + +Questo CLI prende come input un checkpoint di Tensorflow (tre file che iniziano con `bert_model.ckpt`) ed il relativo +file di configurazione (`bert_config.json`), crea un modello Pytorch per questa configurazione, carica i pesi dal +checkpoint di Tensorflow nel modello di Pytorch e salva il modello che ne risulta in un file di salvataggio standard di Pytorch che +può essere importato utilizzando `from_pretrained()` (vedi l'esempio nel +[quicktour](quicktour), [run_glue.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_glue.py)). + +Devi soltanto lanciare questo script di conversione **una volta** per ottenere un modello Pytorch. Dopodiché, potrai tralasciare +il checkpoint di Tensorflow (i tre file che iniziano con `bert_model.ckpt`), ma assicurati di tenere il file di configurazione +(`bert_config.json`) ed il file di vocabolario (`vocab.txt`) in quanto queste componenti sono necessarie anche per il modello di Pytorch. + +Per lanciare questo specifico script di conversione avrai bisogno di un'installazione di Tensorflow e di Pytorch +(`pip install tensorflow`). Il resto della repository richiede soltanto Pytorch. + +Questo è un esempio del processo di conversione per un modello `BERT-Base Uncased` pre-allenato: + +```bash +export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12 +transformers-cli convert --model_type bert \ + --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \ + --config $BERT_BASE_DIR/bert_config.json \ + --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin +``` + +Puoi scaricare i modelli pre-allenati di Google per la conversione [qua](https://github.com/google-research/bert#pre-trained-models). + +## ALBERT + +Per il modello ALBERT, converti checkpoint di Tensorflow in Pytorch utilizzando lo script +[convert_albert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py). + +Il CLI prende come input un checkpoint di Tensorflow (tre file che iniziano con `model.ckpt-best`) e i relativi file di +configurazione (`albert_config.json`), dopodiché crea e salva un modello Pytorch. Per lanciare questa conversione +avrai bisogno di un'installazione di Tensorflow e di Pytorch. + +Ecco un esempio del procedimento di conversione di un modello `ALBERT Base` pre-allenato: + +```bash +export ALBERT_BASE_DIR=/path/to/albert/albert_base +transformers-cli convert --model_type albert \ + --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \ + --config $ALBERT_BASE_DIR/albert_config.json \ + --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin +``` + +Puoi scaricare i modelli pre-allenati di Google per la conversione [qui](https://github.com/google-research/albert#pre-trained-models).
+ +## OpenAI GPT + +Ecco un esempio del processo di conversione di un modello OpenAI GPT pre-allenato, assumendo che il tuo checkpoint di NumPy +sia salvato nello stesso formato dei modelli pre-allenati OpenAI (vedi [qui](https://github.com/openai/finetune-transformer-lm)): +```bash +export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights +transformers-cli convert --model_type gpt \ + --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \ + --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ + [--config OPENAI_GPT_CONFIG] \ + [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] +``` + +## OpenAI GPT-2 + +Ecco un esempio del processo di conversione di un modello OpenAI GPT-2 pre-allenato (vedi [qui](https://github.com/openai/gpt-2)): + +```bash +export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights +transformers-cli convert --model_type gpt2 \ + --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \ + --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ + [--config OPENAI_GPT2_CONFIG] \ + [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK] +``` + +## Transformer-XL + + +Ecco un esempio del processo di conversione di un modello Transformer-XL pre-allenato +(vedi [qui](https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models)): + +```bash +export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint +transformers-cli convert --model_type transfo_xl \ + --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \ + --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ + [--config TRANSFO_XL_CONFIG] \ + [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK] +``` + +## XLNet + +Ecco un esempio del processo di conversione di un modello XLNet pre-allenato: + +```bash +export XLNET_CHECKPOINT_PATH=/path/to/xlnet/checkpoint +export XLNET_CONFIG_PATH=/path/to/xlnet/config +transformers-cli convert --model_type xlnet \ + --tf_checkpoint $XLNET_CHECKPOINT_PATH \ + --config $XLNET_CONFIG_PATH \ + --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ + [--finetuning_task_name XLNET_FINETUNED_TASK] +``` + +## XLM + +Ecco un esempio del processo di conversione di un modello XLM pre-allenato: + +```bash +export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint +transformers-cli convert --model_type xlm \ + --tf_checkpoint $XLM_CHECKPOINT_PATH \ + --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ + [--config XLM_CONFIG] \ + [--finetuning_task_name XLM_FINETUNED_TASK] +``` + +## T5 + +Ecco un esempio del processo di conversione di un modello T5 pre-allenato: + +```bash +export T5=/path/to/t5/uncased_L-12_H-768_A-12 +transformers-cli convert --model_type t5 \ + --tf_checkpoint $T5/t5_model.ckpt \ + --config $T5/t5_config.json \ + --pytorch_dump_output $T5/pytorch_model.bin +``` diff --git a/docs/source/it/converting_tensorflow_models.mdx b/docs/source/it/converting_tensorflow_models.mdx deleted file mode 100644 index b9b30a315c6a..000000000000 --- a/docs/source/it/converting_tensorflow_models.mdx +++ /dev/null @@ -1,155 +0,0 @@ - - -# Convertire checkpoint di Tensorflow - -È disponibile un'interfaccia a linea di comando per convertire gli originali checkpoint di Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM -in modelli che possono essere caricati utilizzando i metodi `from_pretrained` della libreria. - - - -A partire dalla versione 2.3.0 lo script di conversione è parte di transformers CLI (**transformers-cli**), disponibile in ogni installazione -di transformers >=2.3.0. - -La seguente documentazione riflette il formato dei comandi di **transformers-cli convert**.
- - - -## BERT - -Puoi convertire qualunque checkpoint Tensorflow di BERT (in particolare -[i modeli pre-allenati rilasciati da Google](https://github.com/google-research/bert#pre-trained-models)) -in un file di salvataggio Pytorch utilizzando lo script -[convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py). - -Questo CLI prende come input un checkpoint di Tensorflow (tre files che iniziano con `bert_model.ckpt`) ed il relativo -file di configurazione (`bert_config.json`), crea un modello Pytorch per questa configurazione, carica i pesi dal -checkpoint di Tensorflow nel modello di Pytorch e salva il modello che ne risulta in un file di salvataggio standard di Pytorch che -può essere importato utilizzando `from_pretrained()` (vedi l'esempio nel -[quicktour](quicktour) , [run_glue.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_glue.py) ). - -Devi soltanto lanciare questo script di conversione **una volta** per ottenere un modello Pytorch. Dopodichè, potrai tralasciare -il checkpoint di Tensorflow (i tre files che iniziano con `bert_model.ckpt`), ma assicurati di tenere il file di configurazione -(`bert_config.json`) ed il file di vocabolario (`vocab.txt`) in quanto queste componenti sono necessarie anche per il modello di Pytorch. - -Per lanciare questo specifico script di conversione avrai bisogno di un'installazione di Tensorflow e di Pytorch -(`pip install tensorflow`). Il resto della repository richiede soltanto Pytorch. - -Questo è un esempio del processo di conversione per un modello `BERT-Base Uncased` pre-allenato: - -```bash -export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12 -transformers-cli convert --model_type bert \ - --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \ - --config $BERT_BASE_DIR/bert_config.json \ - --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin -``` - -Puoi scaricare i modelli pre-allenati di Google per la conversione [qua](https://github.com/google-research/bert#pre-trained-models). - -## ALBERT - -Per il modello ALBERT, converti checkpoint di Tensoflow in Pytorch utilizzando lo script -[convert_albert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py). - -Il CLI prende come input un checkpoint di Tensorflow (tre files che iniziano con `model.ckpt-best`) e i relativi file di -configurazione (`albert_config.json`), dopodichè crea e salva un modello Pytorch. Per lanciare questa conversione -avrai bisogno di un'installazione di Tensorflow e di Pytorch. - -Ecco un esempio del procedimento di conversione di un modello `ALBERT Base` pre-allenato: - -```bash -export ALBERT_BASE_DIR=/path/to/albert/albert_base -transformers-cli convert --model_type albert \ - --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \ - --config $ALBERT_BASE_DIR/albert_config.json \ - --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin -``` - -Puoi scaricare i modelli pre-allenati di Google per la conversione [qui](https://github.com/google-research/albert#pre-trained-models). 
- -## OpenAI GPT - -Ecco un esempio del processo di conversione di un modello OpenAI GPT pre-allenato, assumendo che il tuo checkpoint di NumPy -sia salvato nello stesso formato dei modelli pre-allenati OpenAI (vedi [qui](https://github.com/openai/finetune-transformer-lm)): -```bash -export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights -transformers-cli convert --model_type gpt \ - --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--config OPENAI_GPT_CONFIG] \ - [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \ -``` - -## OpenAI GPT-2 - -Ecco un esempio del processo di conversione di un modello OpenAI GPT-2 pre-allenato (vedi [qui](https://github.com/openai/gpt-2)): - -```bash -export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights -transformers-cli convert --model_type gpt2 \ - --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--config OPENAI_GPT2_CONFIG] \ - [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK] -``` - -## Transformer-XL - - -Ecco un esempio del processo di conversione di un modello Transformer-XL pre-allenato -(vedi [qui](https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models)): - -```bash -export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint -transformers-cli convert --model_type transfo_xl \ - --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--config TRANSFO_XL_CONFIG] \ - [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK] -``` - -## XLNet - -Ecco un esempio del processo di conversione di un modello XLNet pre-allenato: - -```bash -export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint -export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config -transformers-cli convert --model_type xlnet \ - --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \ - --config $TRANSFO_XL_CONFIG_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--finetuning_task_name XLNET_FINETUNED_TASK] \ -``` - -## XLM - -Ecco un esempio del processo di conversione di un modello XLM pre-allenato: - -```bash -export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint -transformers-cli convert --model_type xlm \ - --tf_checkpoint $XLM_CHECKPOINT_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT - [--config XML_CONFIG] \ - [--finetuning_task_name XML_FINETUNED_TASK] -``` - -## T5 - -Ecco un esempio del processo di conversione di un modello T5 pre-allenato: - -```bash -export T5=/path/to/t5/uncased_L-12_H-768_A-12 -transformers-cli convert --model_type t5 \ - --tf_checkpoint $T5/t5_model.ckpt \ - --config $T5/t5_config.json \ - --pytorch_dump_output $T5/pytorch_model.bin -``` diff --git a/docs/source/it/create_a_model.md b/docs/source/it/create_a_model.md new file mode 100644 index 000000000000..c32040d7d389 --- /dev/null +++ b/docs/source/it/create_a_model.md @@ -0,0 +1,361 @@ + + +# Crea un'architettura personalizzata + +Una [`AutoClass`](model_doc/auto) deduce automaticamente il modello dell'architettura e scarica la configurazione e i pesi pre-allenati. Generalmente, noi consigliamo di usare un `AutoClass` per produrre un codice indipendente dal checkpoint. Ma gli utenti che desiderano un controllo maggiore su parametri specifici del modello possono creare un modello 🤗 Transformers personalizzato da poche classi base. 
Questo potrebbe essere particolarmente utile per qualunque persona sia interessata a studiare, allenare o sperimentare con un modello 🤗 Transformers. In questa guida, approfondisci la creazione di un modello personalizzato senza `AutoClass`. Impara come: + +- Caricare e personalizzare una configurazione del modello. +- Creare un'architettura modello. +- Creare un tokenizer lento e veloce per il testo. +- Creare un estrattore di caratteristiche per attività riguardanti audio o immagini. +- Creare un processore per attività multimodali. + +## Configurazione + +Una [configurazione](main_classes/configuration) si riferisce agli attributi specifici di un modello. Ogni configurazione del modello ha attributi diversi; per esempio, tutti i modelli NLP hanno in comune questi attributi: `hidden_size`, `num_attention_heads`, `num_hidden_layers` e `vocab_size`. Questi attributi specificano il numero di attention heads o strati nascosti con cui costruire un modello. + +Dai un'occhiata più da vicino a [DistilBERT](model_doc/distilbert) accedendo a [`DistilBertConfig`] per ispezionare i suoi attributi: + +```py +>>> from transformers import DistilBertConfig + +>>> config = DistilBertConfig() +>>> print(config) +DistilBertConfig { + "activation": "gelu", + "attention_dropout": 0.1, + "dim": 768, + "dropout": 0.1, + "hidden_dim": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "model_type": "distilbert", + "n_heads": 12, + "n_layers": 6, + "pad_token_id": 0, + "qa_dropout": 0.1, + "seq_classif_dropout": 0.2, + "sinusoidal_pos_embds": false, + "transformers_version": "4.16.2", + "vocab_size": 30522 +} +``` + +[`DistilBertConfig`] mostra tutti gli attributi predefiniti usati per costruire un [`DistilBertModel`] di base. Tutti gli attributi sono personalizzabili, creando uno spazio per sperimentare. Per esempio, puoi configurare un modello predefinito per: + +- Provare una funzione di attivazione diversa con il parametro `activation`. +- Utilizzare un tasso di dropout più elevato per le probabilità di attention con il parametro `attention_dropout`. + +```py +>>> my_config = DistilBertConfig(activation="relu", attention_dropout=0.4) +>>> print(my_config) +DistilBertConfig { + "activation": "relu", + "attention_dropout": 0.4, + "dim": 768, + "dropout": 0.1, + "hidden_dim": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "model_type": "distilbert", + "n_heads": 12, + "n_layers": 6, + "pad_token_id": 0, + "qa_dropout": 0.1, + "seq_classif_dropout": 0.2, + "sinusoidal_pos_embds": false, + "transformers_version": "4.16.2", + "vocab_size": 30522 +} +``` + +Nella funzione [`~PretrainedConfig.from_pretrained`] possono essere modificati gli attributi del modello pre-allenato: + +```py +>>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4) +``` + +Quando la configurazione del modello ti soddisfa, la puoi salvare con [`~PretrainedConfig.save_pretrained`].
Il file della tua configurazione è memorizzato come file JSON nella directory di salvataggio specificata: + +```py +>>> my_config.save_pretrained(save_directory="./your_model_save_path") +``` + +Per riutilizzare il file di configurazione, caricalo con [`~PretrainedConfig.from_pretrained`]: + +```py +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") +``` + + + +Puoi anche salvare il file di configurazione come dizionario oppure come la differenza tra gli attributi della tua configurazione personalizzata e gli attributi della configurazione predefinita! Guarda la documentazione [configuration](main_classes/configuration) per più dettagli. + + + +## Modello + +Il prossimo passo è creare un [modello](main_classes/models). Il modello - spesso indicato anche come architettura - definisce cosa ogni strato deve fare e quali operazioni vengono eseguite. Attributi come `num_hidden_layers` provenienti dalla configurazione sono usati per definire l'architettura. Ogni modello condivide la classe base [`PreTrainedModel`] e alcuni metodi comuni come il ridimensionamento degli input embeddings e la soppressione delle self-attention heads. Inoltre, tutti i modelli sono sottoclassi di [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) o [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/flax.linen.html#module). Ciò significa che i modelli sono compatibili con ciascuno di questi framework. + + + +Carica gli attributi della tua configurazione personalizzata nel modello: + +```py +>>> from transformers import DistilBertModel + +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") +>>> model = DistilBertModel(my_config) +``` + +Questo crea modelli con valori casuali invece di pesi pre-allenati. Non sarai in grado di usare questo modello per niente di utile finché non lo alleni. L'allenamento è un processo costoso e che richiede tempo. Generalmente è meglio usare un modello pre-allenato per ottenere risultati migliori velocemente, utilizzando solo una frazione delle risorse necessarie per l'allenamento. + +Crea un modello pre-allenato con [`~PreTrainedModel.from_pretrained`]: + +```py +>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased") +``` + +Quando carichi pesi pre-allenati, la configurazione del modello predefinito è automaticamente caricata se il modello è fornito da 🤗 Transformers. Tuttavia, puoi ancora sostituire gli attributi - alcuni o tutti - di configurazione del modello predefinito con i tuoi se lo desideri: + +```py +>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) +``` + + +Carica gli attributi di configurazione personalizzati nel modello: + +```py +>>> from transformers import TFDistilBertModel + +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") +>>> tf_model = TFDistilBertModel(my_config) +``` + + +Questo crea modelli con valori casuali invece di pesi pre-allenati. Non sarai in grado di usare questo modello per niente di utile finché non lo alleni. L'allenamento è un processo costoso e che richiede tempo. Generalmente è meglio usare un modello pre-allenato per ottenere risultati migliori velocemente, utilizzando solo una frazione delle risorse necessarie per l'allenamento.
+ +Crea un modello pre-allenato con [`~TFPreTrainedModel.from_pretrained`]: + +```py +>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased") +``` + +Quando carichi pesi pre-allenati, la configurazione del modello predefinito è automaticamente caricata se il modello è fornito da 🤗 Transformers. Tuttavia, puoi ancora sostituire gli attributi - alcuni o tutti - di configurazione del modello predefinito con i tuoi se lo desideri: + +```py +>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) +``` + + + + +### Model head + +A questo punto, hai un modello DistilBERT base i cui output sono gli *hidden states* (in italiano stati nascosti). Gli stati nascosti sono passati come input a un model head per produrre l'output finale. 🤗 Transformers fornisce un model head diverso per ogni attività fintanto che il modello supporta l'attività (i.e., non puoi usare DistilBERT per un'attività sequence-to-sequence come la traduzione). + + + +Per esempio, [`DistilBertForSequenceClassification`] è un modello DistilBERT base con una testa di classificazione per sequenze. La head di classificazione di sequenze è uno strato lineare sopra gli output raggruppati. + +```py +>>> from transformers import DistilBertForSequenceClassification + +>>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Riutilizza facilmente questo checkpoint per un'altra attività passando ad un model head differente. Per un'attività di risposta alle domande, utilizzerai il model head [`DistilBertForQuestionAnswering`]. La head per compiti di question answering è simile alla head di classificazione di sequenze, tranne per il fatto che è uno strato lineare sopra l'output degli stati nascosti (hidden states in inglese). + +```py +>>> from transformers import DistilBertForQuestionAnswering + +>>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") +``` + + +Per esempio, [`TFDistilBertForSequenceClassification`] è un modello DistilBERT base con una head di classificazione di sequenze. La head di classificazione di sequenze è uno strato lineare sopra gli output raggruppati. + +```py +>>> from transformers import TFDistilBertForSequenceClassification + +>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Riutilizza facilmente questo checkpoint per un'altra attività passando ad un model head diverso. Per un'attività di risposta alle domande, utilizzerai il model head [`TFDistilBertForQuestionAnswering`]. La head di risposta alle domande è simile alla head di classificazione di sequenze, tranne per il fatto che è uno strato lineare sopra l'output degli stati nascosti (hidden states in inglese). + +```py +>>> from transformers import TFDistilBertForQuestionAnswering + +>>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") +``` + + + +## Tokenizer + +L'ultima classe base di cui hai bisogno prima di utilizzare un modello per i dati testuali è un [tokenizer](main_classes/tokenizer) per convertire il testo grezzo in tensori. Ci sono due tipi di tokenizer che puoi usare con 🤗 Transformers: + +- [`PreTrainedTokenizer`]: un'implementazione Python di un tokenizer. +- [`PreTrainedTokenizerFast`]: un tokenizer dalla nostra libreria [🤗 Tokenizer](https://huggingface.co/docs/tokenizers/python/latest/) basata su Rust. Questo tipo di tokenizer è significativamente più veloce, specialmente durante la batch tokenization, grazie alla sua implementazione Rust.
Il tokenizer veloce offre anche metodi aggiuntivi come *offset mapping* che associa i token alle loro parole o caratteri originali. + +Entrambi i tokenizer supportano metodi comuni come la codifica e la decodifica, l'aggiunta di nuovi token e la gestione di token speciali. + + + +Non tutti i modelli supportano un tokenizer veloce. Dai un'occhiata a questo [tabella](index#supported-frameworks) per verificare se un modello ha il supporto per tokenizer veloce. + + + +Se hai addestrato il tuo tokenizer, puoi crearne uno dal tuo file *vocabolario*: + +```py +>>> from transformers import DistilBertTokenizer + +>>> my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left") +``` + +È importante ricordare che il vocabolario di un tokenizer personalizzato sarà diverso dal vocabolario generato dal tokenizer di un modello preallenato. È necessario utilizzare il vocabolario di un modello preallenato se si utilizza un modello preallenato, altrimenti gli input non avranno senso. Crea un tokenizer con il vocabolario di un modello preallenato con la classe [`DistilBertTokenizer`]: + +```py +>>> from transformers import DistilBertTokenizer + +>>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") +``` + +Crea un tokenizer veloce con la classe [`DistilBertTokenizerFast`]: + +```py +>>> from transformers import DistilBertTokenizerFast + +>>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased") +``` + + + +Per l'impostazione predefinita, [`AutoTokenizer`] proverà a caricare un tokenizer veloce. Puoi disabilitare questo comportamento impostando `use_fast=False` in `from_pretrained`. + + + +## Estrattore Di Feature + +Un estrattore di caratteristiche (feature in inglese) elabora input audio o immagini. Eredita dalla classe [`~feature_extraction_utils.FeatureExtractionMixin`] base e può anche ereditare dalla classe [`ImageFeatureExtractionMixin`] per l'elaborazione delle caratteristiche dell'immagine o dalla classe [`SequenceFeatureExtractor`] per l'elaborazione degli input audio. + +A seconda che tu stia lavorando a un'attività audio o visiva, crea un estrattore di caratteristiche associato al modello che stai utilizzando. Ad esempio, crea un [`ViTFeatureExtractor`] predefinito se stai usando [ViT](model_doc/vit) per la classificazione delle immagini: + +```py +>>> from transformers import ViTFeatureExtractor + +>>> vit_extractor = ViTFeatureExtractor() +>>> print(vit_extractor) +ViTFeatureExtractor { + "do_normalize": true, + "do_resize": true, + "feature_extractor_type": "ViTFeatureExtractor", + "image_mean": [ + 0.5, + 0.5, + 0.5 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": 2, + "size": 224 +} +``` + + + +Se non stai cercando alcuna personalizzazione, usa il metodo `from_pretrained` per caricare i parametri di default dell'estrattore di caratteristiche di un modello. 
+ + + +Modifica uno qualsiasi dei parametri [`ViTFeatureExtractor`] per creare il tuo estrattore di caratteristiche personalizzato: + +```py +>>> from transformers import ViTFeatureExtractor + +>>> my_vit_extractor = ViTFeatureExtractor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3]) +>>> print(my_vit_extractor) +ViTFeatureExtractor { + "do_normalize": false, + "do_resize": true, + "feature_extractor_type": "ViTFeatureExtractor", + "image_mean": [ + 0.3, + 0.3, + 0.3 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": "PIL.Image.BOX", + "size": 224 +} +``` + +Per gli input audio, puoi creare un [`Wav2Vec2FeatureExtractor`] e personalizzare i parametri in modo simile: + +```py +>>> from transformers import Wav2Vec2FeatureExtractor + +>>> w2v2_extractor = Wav2Vec2FeatureExtractor() +>>> print(w2v2_extractor) +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 16000 +} +``` + +## Processore + +Per modelli che supportano attività multimodali, 🤗 Transformers offre una classe di processore che racchiude comodamente un estrattore di caratteristiche e un tokenizer in un unico oggetto. Ad esempio, utilizziamo [`Wav2Vec2Processor`] per un'attività di riconoscimento vocale automatico (ASR). ASR trascrive l'audio in testo, quindi avrai bisogno di un estrattore di caratteristiche e di un tokenizer. + +Crea un estrattore di feature per gestire gli input audio: + +```py +>>> from transformers import Wav2Vec2FeatureExtractor + +>>> feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True) +``` + +Crea un tokenizer per gestire gli input di testo: + +```py +>>> from transformers import Wav2Vec2CTCTokenizer + +>>> tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt") +``` + +Combinare l'estrattore di caratteristiche e il tokenizer in [`Wav2Vec2Processor`]: + +```py +>>> from transformers import Wav2Vec2Processor + +>>> processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) +``` + +Con due classi di base - configurazione e modello - e una classe di preelaborazione aggiuntiva (tokenizer, estrattore di caratteristiche o processore), puoi creare qualsiasi modello supportato da 🤗 Transformers. Ognuna di queste classi base è configurabile, consentendoti di utilizzare gli attributi specifici che desideri. È possibile impostare facilmente un modello per l'addestramento o modificare un modello preallenato esistente per la messa a punto. \ No newline at end of file diff --git a/docs/source/it/create_a_model.mdx b/docs/source/it/create_a_model.mdx deleted file mode 100644 index 6e11f3f1d029..000000000000 --- a/docs/source/it/create_a_model.mdx +++ /dev/null @@ -1,357 +0,0 @@ - - -# Crea un'architettura personalizzata - -Una [`AutoClass`](model_doc/auto) deduce automaticamente il modello dell'architettura e scarica la configurazione e i pesi pre-allenati. Generalmente, noi consigliamo di usare un `AutoClass` per produrre un codice indipendente dal checkpoint. Ma gli utenti che desiderano un controllo maggiore su parametri specifici del modello possono creare un modello 🤗 Transformers personalizzato da poche classi base. Questo potrebbe essere particolarmente utile per qualunque persona sia interessata nel studiare, allenare o sperimentare con un modello 🤗 Transformers. 
In questa guida, approfondisci la creazione di un modello personalizzato senza `AutoClass`. Impara come: - -- Caricare e personalizzare una configurazione del modello. -- Creare un'architettura modello. -- Creare un tokenizer lento e veloce per il testo. -- Creare un estrattore di caratteristiche per attività riguardanti audio o immagini. -- Creare un processore per attività multimodali. - -## Configurazione - -Una [configurazione](main_classes/configuration) si riferisce agli attributi specifici di un modello. Ogni configurazione del modello ha attributi diversi; per esempio, tutti i modelli npl hanno questi attributi in comune `hidden_size`, `num_attention_heads`, `num_hidden_layers` e `vocab_size`. Questi attributi specificano il numero di attention heads o strati nascosti con cui costruire un modello. - -Dai un'occhiata più da vicino a [DistilBERT](model_doc/distilbert) accedendo a [`DistilBertConfig`] per ispezionare i suoi attributi: - -```py ->>> from transformers import DistilBertConfig - ->>> config = DistilBertConfig() ->>> print(config) -DistilBertConfig { - "activation": "gelu", - "attention_dropout": 0.1, - "dim": 768, - "dropout": 0.1, - "hidden_dim": 3072, - "initializer_range": 0.02, - "max_position_embeddings": 512, - "model_type": "distilbert", - "n_heads": 12, - "n_layers": 6, - "pad_token_id": 0, - "qa_dropout": 0.1, - "seq_classif_dropout": 0.2, - "sinusoidal_pos_embds": false, - "transformers_version": "4.16.2", - "vocab_size": 30522 -} -``` - -[`DistilBertConfig`] mostra tutti gli attributi predefiniti usati per costruire una base [`DistilBertModel`]. Tutti gli attributi sono personalizzabili, creando uno spazio per sperimentare. Per esempio, puoi configurare un modello predefinito per: - -- Provare un funzione di attivazione diversa con il parametro `activation`. -- Utilizzare tasso di drop out più elevato per le probalità di attention con il parametro `attention_dropout`. - -```py ->>> my_config = DistilBertConfig(activation="relu", attention_dropout=0.4) ->>> print(my_config) -DistilBertConfig { - "activation": "relu", - "attention_dropout": 0.4, - "dim": 768, - "dropout": 0.1, - "hidden_dim": 3072, - "initializer_range": 0.02, - "max_position_embeddings": 512, - "model_type": "distilbert", - "n_heads": 12, - "n_layers": 6, - "pad_token_id": 0, - "qa_dropout": 0.1, - "seq_classif_dropout": 0.2, - "sinusoidal_pos_embds": false, - "transformers_version": "4.16.2", - "vocab_size": 30522 -} -``` - -Nella funzione [`~PretrainedConfig.from_pretrained`] possono essere modificati gli attributi del modello pre-allenato: - -```py ->>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4) -``` - -Quando la configurazione del modello ti soddisfa, la puoi salvare con [`~PretrainedConfig.save_pretrained`]. Il file della tua configurazione è memorizzato come file JSON nella save directory specificata: - -```py ->>> my_config.save_pretrained(save_directory="./your_model_save_path") -``` - -Per riutilizzare la configurazione del file, caricalo con [`~PretrainedConfig.from_pretrained`]: - -```py ->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") -``` - - - -Puoi anche salvare il file di configurazione come dizionario oppure come la differenza tra gli attributi della tua configurazione personalizzata e gli attributi della configurazione predefinita! Guarda la documentazione [configuration](main_classes/configuration) per più dettagli. 
- - - -## Modello - -Il prossimo passo e di creare [modello](main_classes/models). Il modello - vagamente riferito anche come architettura - definisce cosa ogni strato deve fare e quali operazioni stanno succedendo. Attributi come `num_hidden_layers` provenienti dalla configurazione sono usati per definire l'architettura. Ogni modello condivide la classe base [`PreTrainedModel`] e alcuni metodi comuni come il ridimensionamento degli input embeddings e la soppressione delle self-attention heads . Inoltre, tutti i modelli sono la sottoclasse di [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) o [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/flax.linen.html#module). Cio significa che i modelli sono compatibili con l'uso di ciascun di framework. - - - -Carica gli attributi della tua configurazione personalizzata nel modello: - -```py ->>> from transformers import DistilBertModel - ->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") ->>> model = DistilBertModel(my_config) -``` - -Questo crea modelli con valori casuali invece di pesi pre-allenati. Non sarai in grado di usare questo modello per niente di utile finché non lo alleni. L'allenamento è un processo costoso e che richiede tempo . Generalmente è meglio usare un modello pre-allenato per ottenere risultati migliori velocemente, utilizzando solo una frazione delle risorse neccesarie per l'allenamento. - -Crea un modello pre-allenato con [`~PreTrainedModel.from_pretrained`]: - -```py ->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased") -``` - -Quando carichi pesi pre-allenati, la configurazione del modello predefinito è automaticamente caricata se il modello è fornito da 🤗 Transformers. Tuttavia, puoi ancora sostituire gli attributi - alcuni o tutti - di configurazione del modello predefinito con i tuoi se lo desideri: - -```py ->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) -``` - - -Carica gli attributi di configurazione personalizzati nel modello: - -```py ->>> from transformers import TFDistilBertModel - ->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") ->>> tf_model = TFDistilBertModel(my_config) -``` - - -Questo crea modelli con valori casuali invece di pesi pre-allenati. Non sarai in grado di usare questo modello per niente di utile finché non lo alleni. L'allenamento è un processo costoso e che richiede tempo . Generalmente è meglio usare un modello pre-allenato per ottenere risultati migliori velocemente, utilizzando solo una frazione delle risorse neccesarie per l'allenamento. - -Crea un modello pre-allenoto con [`~TFPreTrainedModel.from_pretrained`]: - -```py ->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased") -``` - -Quando carichi pesi pre-allenati, la configurazione del modello predefinito è automaticamente caricato se il modello è fornito da 🤗 Transformers. Tuttavia, puoi ancora sostituire gli attributi - alcuni o tutti - di configurazione del modello predefinito con i tuoi se lo desideri: - -```py ->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) -``` - - - - -### Model head - -A questo punto, hai un modello DistilBERT base i cui output sono gli *hidden states* (in italiano stati nascosti). Gli stati nascosti sono passati come input a un model head per produrre l'output finale. 
🤗 Transformers fornisce un model head diverso per ogni attività fintanto che il modello supporta l'attività (i.e., non puoi usare DistilBERT per un attività sequence-to-sequence come la traduzione). - - - -Per esempio, [`DistilBertForSequenceClassification`] è un modello DistilBERT base con una testa di classificazione per sequenze. La sequenza di classificazione head è uno strato lineare sopra gli output ragruppati. - -```py ->>> from transformers import DistilBertForSequenceClassification - ->>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Riutilizza facilmente questo checkpoint per un'altra attività passando ad un model head differente. Per un attività di risposta alle domande, utilizzerai il model head [`DistilBertForQuestionAnswering`]. La head per compiti di question answering è simile alla classificazione di sequenza head tranne per il fatto che è uno strato lineare sopra l'output degli stati nascosti (hidden states in inglese) - -```py ->>> from transformers import DistilBertForQuestionAnswering - ->>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") -``` - - -Per esempio, [`TFDistilBertForSequenceClassification`] è un modello DistilBERT base con classificazione di sequenza head. La classificazione di sequenza head è uno strato lineare sopra gli output raggruppati. - -```py ->>> from transformers import TFDistilBertForSequenceClassification - ->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Riutilizza facilmente questo checkpoint per un altra attività passando ad un modello head diverso. Per un attività di risposta alle domande, utilizzerai il model head [`TFDistilBertForQuestionAnswering`]. Il head di risposta alle domande è simile alla sequenza di classificazione head tranne per il fatto che è uno strato lineare sopra l'output degli stati nascosti (hidden states in inglese) - -```py ->>> from transformers import TFDistilBertForQuestionAnswering - ->>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") -``` - - - -## Tokenizer - -L'ultima classe base di cui hai bisogno prima di utilizzare un modello per i dati testuali è un [tokenizer](main_classes/tokenizer) per convertire il testo grezzo in tensori. Ci sono due tipi di tokenizer che puoi usare con 🤗 Transformers: - -- [`PreTrainedTokenizer`]: un'implementazione Python di un tokenizer. -- [`PreTrainedTokenizerFast`]: un tokenizer dalla nostra libreria [🤗 Tokenizer](https://huggingface.co/docs/tokenizers/python/latest/) basata su Rust. Questo tipo di tokenizer è significativamente più veloce, specialmente durante la batch tokenization, grazie alla sua implementazione Rust. Il tokenizer veloce offre anche metodi aggiuntivi come *offset mapping* che associa i token alle loro parole o caratteri originali. - -Entrambi i tokenizer supportano metodi comuni come la codifica e la decodifica, l'aggiunta di nuovi token e la gestione di token speciali. - - - -Non tutti i modelli supportano un tokenizer veloce. Dai un'occhiata a questo [tabella](index#supported-frameworks) per verificare se un modello ha il supporto per tokenizer veloce. 
- - - -Se hai addestrato il tuo tokenizer, puoi crearne uno dal tuo file *vocabolario*: - -```py ->>> from transformers import DistilBertTokenizer - ->>> my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left") -``` - -È importante ricordare che il vocabolario di un tokenizer personalizzato sarà diverso dal vocabolario generato dal tokenizer di un modello preallenato. È necessario utilizzare il vocabolario di un modello preallenato se si utilizza un modello preallenato, altrimenti gli input non avranno senso. Crea un tokenizer con il vocabolario di un modello preallenato con la classe [`DistilBertTokenizer`]: - -```py ->>> from transformers import DistilBertTokenizer - ->>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") -``` - -Crea un tokenizer veloce con la classe [`DistilBertTokenizerFast`]: - -```py ->>> from transformers import DistilBertTokenizerFast - ->>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased") -``` - - - -Per l'impostazione predefinita, [`AutoTokenizer`] proverà a caricare un tokenizer veloce. Puoi disabilitare questo comportamento impostando `use_fast=False` in `from_pretrained`. - - - -## Estrattore Di Feature - -Un estrattore di caratteristiche (feature in inglese) elabora input audio o immagini. Eredita dalla classe [`~feature_extraction_utils.FeatureExtractionMixin`] base e può anche ereditare dalla classe [`ImageFeatureExtractionMixin`] per l'elaborazione delle caratteristiche dell'immagine o dalla classe [`SequenceFeatureExtractor`] per l'elaborazione degli input audio. - -A seconda che tu stia lavorando a un'attività audio o visiva, crea un estrattore di caratteristiche associato al modello che stai utilizzando. Ad esempio, crea un [`ViTFeatureExtractor`] predefinito se stai usando [ViT](model_doc/vit) per la classificazione delle immagini: - -```py ->>> from transformers import ViTFeatureExtractor - ->>> vit_extractor = ViTFeatureExtractor() ->>> print(vit_extractor) -ViTFeatureExtractor { - "do_normalize": true, - "do_resize": true, - "feature_extractor_type": "ViTFeatureExtractor", - "image_mean": [ - 0.5, - 0.5, - 0.5 - ], - "image_std": [ - 0.5, - 0.5, - 0.5 - ], - "resample": 2, - "size": 224 -} -``` - - - -Se non stai cercando alcuna personalizzazione, usa il metodo `from_pretrained` per caricare i parametri di default dell'estrattore di caratteristiche di un modello. 
- - - -Modifica uno qualsiasi dei parametri [`ViTFeatureExtractor`] per creare il tuo estrattore di caratteristiche personalizzato: - -```py ->>> from transformers import ViTFeatureExtractor - ->>> my_vit_extractor = ViTFeatureExtractor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3]) ->>> print(my_vit_extractor) -ViTFeatureExtractor { - "do_normalize": false, - "do_resize": true, - "feature_extractor_type": "ViTFeatureExtractor", - "image_mean": [ - 0.3, - 0.3, - 0.3 - ], - "image_std": [ - 0.5, - 0.5, - 0.5 - ], - "resample": "PIL.Image.BOX", - "size": 224 -} -``` - -Per gli input audio, puoi creare un [`Wav2Vec2FeatureExtractor`] e personalizzare i parametri in modo simile: - -```py ->>> from transformers import Wav2Vec2FeatureExtractor - ->>> w2v2_extractor = Wav2Vec2FeatureExtractor() ->>> print(w2v2_extractor) -Wav2Vec2FeatureExtractor { - "do_normalize": true, - "feature_extractor_type": "Wav2Vec2FeatureExtractor", - "feature_size": 1, - "padding_side": "right", - "padding_value": 0.0, - "return_attention_mask": false, - "sampling_rate": 16000 -} -``` - -## Processore - -Per modelli che supportano attività multimodali, 🤗 Transformers offre una classe di processore che racchiude comodamente un estrattore di caratteristiche e un tokenizer in un unico oggetto. Ad esempio, utilizziamo [`Wav2Vec2Processor`] per un'attività di riconoscimento vocale automatico (ASR). ASR trascrive l'audio in testo, quindi avrai bisogno di un estrattore di caratteristiche e di un tokenizer. - -Crea un estrattore di feature per gestire gli input audio: - -```py ->>> from transformers import Wav2Vec2FeatureExtractor - ->>> feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True) -``` - -Crea un tokenizer per gestire gli input di testo: - -```py ->>> from transformers import Wav2Vec2CTCTokenizer - ->>> tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt") -``` - -Combinare l'estrattore di caratteristiche e il tokenizer in [`Wav2Vec2Processor`]: - -```py ->>> from transformers import Wav2Vec2Processor - ->>> processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) -``` - -Con due classi di base - configurazione e modello - e una classe di preelaborazione aggiuntiva (tokenizer, estrattore di caratteristiche o processore), puoi creare qualsiasi modello supportato da 🤗 Transformers. Ognuna di queste classi base è configurabile, consentendoti di utilizzare gli attributi specifici che desideri. È possibile impostare facilmente un modello per l'addestramento o modificare un modello preallenato esistente per la messa a punto. \ No newline at end of file diff --git a/docs/source/it/custom_models.md b/docs/source/it/custom_models.md new file mode 100644 index 000000000000..b0cdf4cd7bf0 --- /dev/null +++ b/docs/source/it/custom_models.md @@ -0,0 +1,359 @@ + + +# Condividere modelli personalizzati +La libreria 🤗 Transformers è studiata per essere facilmente estendibile. Il codice di ogni modello è interamente +situato in una sottocartella del repository senza alcuna astrazione, perciò puoi facilmente copiare il file di un +modello e modificarlo in base ai tuoi bisogni. + +Se stai scrivendo un nuovo modello, potrebbe essere più semplice iniziare da zero. 
In questo tutorial, ti mostreremo +come scrivere un modello personalizzato e la sua configurazione in modo che possa essere utilizzato all’interno di +Transformers, e come condividerlo con la community (assieme al relativo codice) così che tutte le persone possano usarlo, anche +se non presente nella libreria 🤗 Transformers. + +Illustriamo tutto questo su un modello ResNet, avvolgendo la classe ResNet della +[libreria timm](https://github.com/rwightman/pytorch-image-models) in un [`PreTrainedModel`]. + +## Scrivere una configurazione personalizzata +Prima di iniziare a lavorare al modello, scriviamone la configurazione. La configurazione di un modello è un oggetto +che contiene tutte le informazioni necessarie per la build del modello. Come vedremo nella prossima sezione, il +modello può soltanto essere inizializzato tramite `config`, per cui dovremo rendere tale oggetto più completo possibile. + +Nel nostro esempio, prenderemo un paio di argomenti della classe ResNet che potremmo voler modificare. +Configurazioni differenti ci daranno quindi i differenti possibili tipi di ResNet. Salveremo poi questi argomenti, +dopo averne controllato la validità. + +```python +from transformers import PretrainedConfig +from typing import List + + +class ResnetConfig(PretrainedConfig): + model_type = "resnet" + + def __init__( + self, + block_type="bottleneck", + layers: List[int] = [3, 4, 6, 3], + num_classes: int = 1000, + input_channels: int = 3, + cardinality: int = 1, + base_width: int = 64, + stem_width: int = 64, + stem_type: str = "", + avg_down: bool = False, + **kwargs, + ): + if block_type not in ["basic", "bottleneck"]: + raise ValueError(f"`block_type` must be 'basic' or bottleneck', got {block_type}.") + if stem_type not in ["", "deep", "deep-tiered"]: + raise ValueError(f"`stem_type` must be '', 'deep' or 'deep-tiered', got {stem_type}.") + + self.block_type = block_type + self.layers = layers + self.num_classes = num_classes + self.input_channels = input_channels + self.cardinality = cardinality + self.base_width = base_width + self.stem_width = stem_width + self.stem_type = stem_type + self.avg_down = avg_down + super().__init__(**kwargs) +``` + +Le tre cose più importanti da ricordare quando scrivi le tue configurazioni sono le seguenti: +- Devi ereditare da `Pretrainedconfig`, +- Il metodo `__init__` del tuo `Pretrainedconfig` deve accettare i kwargs, +- I `kwargs` devono essere passati alla superclass `__init__` + +L’eredità è importante per assicurarsi di ottenere tutte le funzionalità della libreria 🤗 transformers, +mentre gli altri due vincoli derivano dal fatto che un `Pretrainedconfig` ha più campi di quelli che stai settando. +Quando ricarichi una config da un metodo `from_pretrained`, questi campi devono essere accettati dalla tua config e +poi inviati alla superclasse. + +Definire un `model_type` per la tua configurazione (qua `model_type = “resnet”`) non è obbligatorio, a meno che tu +non voglia registrare il modello con le classi Auto (vedi l'ultima sezione). + +Una volta completato, puoi facilmente creare e salvare la tua configurazione come faresti con ogni altra configurazione +di modelli della libreria. Ecco come possiamo creare la config di un resnet50d e salvarlo: + +```py +resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) +resnet50d_config.save_pretrained("custom-resnet") +``` + +Questo salverà un file chiamato `config.json` all'interno della cartella `custom-resnet`. 
Potrai poi ricaricare la tua +config con il metodo `from_pretrained`. + +```py +resnet50d_config = ResnetConfig.from_pretrained("custom-resnet") +``` + +Puoi anche usare qualunque altro metodo della classe [`PretrainedConfig`], come [`~PretrainedConfig.push_to_hub`] +per caricare direttamente la tua configurazione nell'Hub. + +## Scrivere un modello personalizzato + +Ora che abbiamo la nostra configurazione ResNet, possiamo continuare a scrivere il modello. In realtà, ne scriveremo +due: uno che estrae le features nascoste da un batch di immagini (come [`BertModel`]) e uno che è utilizzabile per +la classificazione di immagini (come [`BertForSequenceClassification`]). + +Come abbiamo menzionato in precedenza, scriveremo soltanto un wrapper del modello, per mantenerlo semplice ai fini di +questo esempio. L'unica cosa che dobbiamo fare prima di scrivere questa classe è una mappatura fra i tipi di blocco e +le vere classi dei blocchi. Successivamente il modello è definito tramite la configurazione, passando tutto quanto alla +classe `ResNet`. + +```py +from transformers import PreTrainedModel +from timm.models.resnet import BasicBlock, Bottleneck, ResNet +from .configuration_resnet import ResnetConfig + + +BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck} + + +class ResnetModel(PreTrainedModel): + config_class = ResnetConfig + + def __init__(self, config): + super().__init__(config) + block_layer = BLOCK_MAPPING[config.block_type] + self.model = ResNet( + block_layer, + config.layers, + num_classes=config.num_classes, + in_chans=config.input_channels, + cardinality=config.cardinality, + base_width=config.base_width, + stem_width=config.stem_width, + stem_type=config.stem_type, + avg_down=config.avg_down, + ) + + def forward(self, tensor): + return self.model.forward_features(tensor) +``` + +Per il modello che classificherà le immagini, cambiamo soltanto il metodo forward: + +```py +import torch + + +class ResnetModelForImageClassification(PreTrainedModel): + config_class = ResnetConfig + + def __init__(self, config): + super().__init__(config) + block_layer = BLOCK_MAPPING[config.block_type] + self.model = ResNet( + block_layer, + config.layers, + num_classes=config.num_classes, + in_chans=config.input_channels, + cardinality=config.cardinality, + base_width=config.base_width, + stem_width=config.stem_width, + stem_type=config.stem_type, + avg_down=config.avg_down, + ) + + def forward(self, tensor, labels=None): + logits = self.model(tensor) + if labels is not None: + loss = torch.nn.functional.cross_entropy(logits, labels) + return {"loss": loss, "logits": logits} + return {"logits": logits} +``` + +Nota come, in entrambi i casi, ereditiamo da `PreTrainedModel` e chiamiamo l'inizializzazione della superclasse +con la `config` (un po' come quando scrivi un normale `torch.nn.Module`). La riga che imposta la `config_class` +non è obbligatoria, a meno che tu non voglia registrare il modello con le classi Auto (vedi l'ultima sezione). + + + +Se il tuo modello è molto simile a un modello all'interno della libreria, puoi ri-usare la stessa configurazione di quel modello. + + + +Puoi fare in modo che il tuo modello restituisca in output qualunque cosa tu voglia, ma far restituire un dizionario +come abbiamo fatto per `ResnetModelForImageClassification`, con la funzione di perdita inclusa quando vengono passate le labels, +renderà il tuo modello direttamente utilizzabile all'interno della classe [`Trainer`].
Utilizzare altri formati di output va bene +se hai in progetto di utilizzare un tuo loop di allenamento, o se utilizzerai un'altra libreria per l'addestramento. + +Ora che abbiamo la classe del nostro modello, creiamone uno: + +```py +resnet50d = ResnetModelForImageClassification(resnet50d_config) +``` + +Ribadiamo, puoi usare qualunque metodo dei [`PreTrainedModel`], come [`~PreTrainedModel.save_pretrained`] o +[`~PreTrainedModel.push_to_hub`]. Utilizzeremo quest'ultimo nella prossima sezione, e vedremo come caricare i pesi del +modello assieme al codice del modello stesso. Ma prima, carichiamo alcuni pesi pre-allenati all'interno del nostro modello. + +Nel tuo caso specifico, probabilmente allenerai il tuo modello sui tuoi dati. Per velocizzare in questo tutorial, +utilizzeremo la versione pre-allenata del resnet50d. Dato che il nostro modello è soltanto un wrapper attorno a quel modello, +sarà facile trasferirne i pesi: + +```py +import timm + +pretrained_model = timm.create_model("resnet50d", pretrained=True) +resnet50d.model.load_state_dict(pretrained_model.state_dict()) +``` + +Vediamo adesso come assicurarci che quando facciamo [`~PreTrainedModel.save_pretrained`] o [`~PreTrainedModel.push_to_hub`], +il codice del modello venga salvato. + +## Inviare il codice all'Hub + + + +Questa API è sperimentale e potrebbe avere alcuni cambiamenti nei prossimi rilasci. + + + +Innanzitutto, assicurati che il tuo modello sia completamente definito in un file `.py`. Può sfruttare import relativi +ad altri file, purchè questi siano nella stessa directory (non supportiamo ancora sotto-moduli per questa funzionalità). +Per questo esempio, definiremo un file `modeling_resnet.py` e un file `configuration_resnet.py` in una cartella dell'attuale +working directory chiamata `resnet_model`. Il file configuration contiene il codice per `ResnetConfig` e il file modeling +contiene il codice di `ResnetModel` e `ResnetModelForImageClassification`. + +``` +. +└── resnet_model + ├── __init__.py + ├── configuration_resnet.py + └── modeling_resnet.py +``` + +Il file `__init__.py` può essere vuoto, serve solo perchè Python capisca che `resnet_model` può essere utilizzato come un modulo. + + + +Se stai copiando i file relativi alla modellazione della libreria, dovrai sostituire tutti gli import relativi in cima al file con import del + pacchetto `transformers`. + + + +Nota che puoi ri-utilizzare (o usare come sottoclassi) un modello/configurazione esistente. + +Per condividere il tuo modello con la community, segui questi passi: prima importa il modello ResNet e la sua configurazione +dai nuovi file creati: + +```py +from resnet_model.configuration_resnet import ResnetConfig +from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification +``` + +Dopodichè dovrai dire alla libreria che vuoi copiare i file con il codice di quegli oggetti quando utilizzi il metodo +`save_pretrained` e registrarli in modo corretto con una Auto classe (specialmente per i modelli). Utilizza semplicemente: + +```py +ResnetConfig.register_for_auto_class() +ResnetModel.register_for_auto_class("AutoModel") +ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification") +``` + +Nota che non c'è bisogno di specificare una Auto classe per la configurazione (c'è solo una Auto classe per le configurazioni, +[`AutoConfig`], ma è diversa per i modelli). 
Il tuo modello personalizato potrebbe essere utilizzato per diverse tasks, +per cui devi specificare quale delle classi Auto è quella corretta per il tuo modello. + +Successivamente, creiamo i modelli e la config come abbiamo fatto in precedenza: + +```py +resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) +resnet50d = ResnetModelForImageClassification(resnet50d_config) + +pretrained_model = timm.create_model("resnet50d", pretrained=True) +resnet50d.model.load_state_dict(pretrained_model.state_dict()) +``` + +Adesso, per inviare il modello all'Hub, assicurati di aver effettuato l'accesso. Lancia dal tuo terminale: + +```bash +huggingface-cli login +``` + +O da un notebook: + +```py +from huggingface_hub import notebook_login + +notebook_login() +``` + +Potrai poi inviare il tutto sul tuo profilo (o di un'organizzazione di cui fai parte) in questo modo: + +```py +resnet50d.push_to_hub("custom-resnet50d") +``` + +Oltre ai pesi del modello e alla configurazione in formato json, questo ha anche copiato i file `.py` modeling e +configuration all'interno della cartella `custom-resnet50d` e ha caricato i risultati sull'Hub. Puoi controllare +i risultati in questa [model repo](https://huggingface.co/sgugger/custom-resnet50d). + +Puoi controllare il tutorial di condivisione [tutorial di condivisione](model_sharing) per più informazioni sul +metodo con cui inviare all'Hub. + +## Usare un modello con codice personalizzato + +Puoi usare ogni configurazione, modello o tokenizer con file di codice personalizzati nella sua repository +con le classi Auto e il metodo `from_pretrained`. Tutti i files e il codice caricati sull'Hub sono scansionati da malware +(fai riferimento alla documentazione [Hub security](https://huggingface.co/docs/hub/security#malware-scanning) per più informazioni), +ma dovresti comunque assicurarti dell'affidabilità del codice e dell'autore per evitare di eseguire codice dannoso sulla tua macchina. +Imposta `trust_remote_code=True` per usare un modello con codice personalizzato: + +```py +from transformers import AutoModelForImageClassification + +model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True) +``` + +Inoltre, raccomandiamo fortemente di passare un hash del commit come `revision` per assicurarti che le autrici o gli autori del modello +non abbiano modificato il codice con alcune nuove righe dannose (a meno che non ti fidi completamente della fonte): + +```py +commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292" +model = AutoModelForImageClassification.from_pretrained( + "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash +) +``` + +Nota che quando cerchi la storia dei commit della repo del modello sull'Hub, c'è un bottone con cui facilmente copiare il +commit hash di ciascun commit. + +## Registrare un modello con codice personalizzato nelle classi Auto + +Se stai scrivendo una libreria che estende 🤗 Transformers, potresti voler estendere le classi Auto per includere il tuo modello. +Questo è diverso dall'inviare codice nell'Hub: gli utenti dovranno importare la tua libreria per ottenere il modello personalizzato +(anzichè scaricare automaticamente il modello dall'Hub). 
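+
+Dal punto di vista di chi userà la tua libreria, il risultato sarà qualcosa di simile a questo schizzo (puramente
+ipotetico: si assume che l'import del pacchetto `resnet_model` esegua le registrazioni mostrate di seguito e che
+"percorso/del/checkpoint" contenga un modello salvato con `save_pretrained`):
+
+```py
+import resnet_model  # ipotetico: l'import registra le classi personalizzate nelle classi Auto
+
+from transformers import AutoModelForImageClassification
+
+model = AutoModelForImageClassification.from_pretrained("percorso/del/checkpoint")
+```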
+ +Finchè il tuo file di configurazione ha un attributo `model_type` diverso dai model types esistenti, e finchè le tue +classi modello hanno i corretti attributi `config_class`, potrai semplicemente aggiungerli alle classi Auto come segue: + +```py +from transformers import AutoConfig, AutoModel, AutoModelForImageClassification + +AutoConfig.register("resnet", ResnetConfig) +AutoModel.register(ResnetConfig, ResnetModel) +AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification) +``` + +Nota che il primo argomento utilizzato quando registri la configurazione di un modello personalizzato con [`AutoConfig`] +deve corrispondere al `model_type` della tua configurazione personalizzata, ed il primo argomento utilizzato quando +registri i tuoi modelli personalizzati in una qualunque classe Auto del modello deve corrispondere alla `config_class` +di quei modelli. diff --git a/docs/source/it/custom_models.mdx b/docs/source/it/custom_models.mdx deleted file mode 100644 index b4b0302e29e3..000000000000 --- a/docs/source/it/custom_models.mdx +++ /dev/null @@ -1,355 +0,0 @@ - - -# Condividere modelli personalizzati -La libreria 🤗 Transformers è studiata per essere facilmente estendibile. Il codice di ogni modello è interamente -situato in una sottocartella del repository senza alcuna astrazione, perciò puoi facilmente copiare il file di un -modello e modificarlo in base ai tuoi bisogni. - -Se stai scrivendo un nuovo modello, potrebbe essere più semplice iniziare da zero. In questo tutorial, ti mostreremo -come scrivere un modello personalizzato e la sua configurazione in modo che possa essere utilizzato all’interno di -Transformers, e come condividerlo con la community (assieme al relativo codice) così che tutte le persone possano usarlo, anche -se non presente nella libreria 🤗 Transformers. - -Illustriamo tutto questo su un modello ResNet, avvolgendo la classe ResNet della -[libreria timm](https://github.com/rwightman/pytorch-image-models) in un [`PreTrainedModel`]. - -## Scrivere una configurazione personalizzata -Prima di iniziare a lavorare al modello, scriviamone la configurazione. La configurazione di un modello è un oggetto -che contiene tutte le informazioni necessarie per la build del modello. Come vedremo nella prossima sezione, il -modello può soltanto essere inizializzato tramite `config`, per cui dovremo rendere tale oggetto più completo possibile. - -Nel nostro esempio, prenderemo un paio di argomenti della classe ResNet che potremmo voler modificare. -Configurazioni differenti ci daranno quindi i differenti possibili tipi di ResNet. Salveremo poi questi argomenti, -dopo averne controllato la validità. 
- -```python -from transformers import PretrainedConfig -from typing import List - - -class ResnetConfig(PretrainedConfig): - model_type = "resnet" - - def __init__( - self, - block_type="bottleneck", - layers: List[int] = [3, 4, 6, 3], - num_classes: int = 1000, - input_channels: int = 3, - cardinality: int = 1, - base_width: int = 64, - stem_width: int = 64, - stem_type: str = "", - avg_down: bool = False, - **kwargs, - ): - if block_type not in ["basic", "bottleneck"]: - raise ValueError(f"`block_type` must be 'basic' or bottleneck', got {block_type}.") - if stem_type not in ["", "deep", "deep-tiered"]: - raise ValueError(f"`stem_type` must be '', 'deep' or 'deep-tiered', got {stem_type}.") - - self.block_type = block_type - self.layers = layers - self.num_classes = num_classes - self.input_channels = input_channels - self.cardinality = cardinality - self.base_width = base_width - self.stem_width = stem_width - self.stem_type = stem_type - self.avg_down = avg_down - super().__init__(**kwargs) -``` - -Le tre cose più importanti da ricordare quando scrivi le tue configurazioni sono le seguenti: -- Devi ereditare da `Pretrainedconfig`, -- Il metodo `__init__` del tuo `Pretrainedconfig` deve accettare i kwargs, -- I `kwargs` devono essere passati alla superclass `__init__` - -L’eredità è importante per assicurarsi di ottenere tutte le funzionalità della libreria 🤗 transformers, -mentre gli altri due vincoli derivano dal fatto che un `Pretrainedconfig` ha più campi di quelli che stai settando. -Quando ricarichi una config da un metodo `from_pretrained`, questi campi devono essere accettati dalla tua config e -poi inviati alla superclasse. - -Definire un `model_type` per la tua configurazione (qua `model_type = “resnet”`) non è obbligatorio, a meno che tu -non voglia registrare il modello con le classi Auto (vedi l'ultima sezione). - -Una volta completato, puoi facilmente creare e salvare la tua configurazione come faresti con ogni altra configurazione -di modelli della libreria. Ecco come possiamo creare la config di un resnet50d e salvarlo: - -```py -resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) -resnet50d_config.save_pretrained("custom-resnet") -``` - -Questo salverà un file chiamato `config.json` all'interno della cartella `custom-resnet`. Potrai poi ricaricare la tua -config con il metodo `from_pretrained`. - -```py -resnet50d_config = ResnetConfig.from_pretrained("custom-resnet") -``` - -Puoi anche usare qualunque altro metodo della classe [`PretrainedConfig`], come [`~PretrainedConfig.push_to_hub`] -per caricare direttamente la tua configurazione nell'hub. - -## Scrivere un modello personalizzato - -Ora che abbiamo la nostra configurazione ResNet, possiamo continuare a scrivere il modello. In realtà, ne scriveremo -due: uno che estrae le features nascoste da una batch di immagini (come [`BertModel`]) e uno che è utilizzabile per -la classificazione di immagini (come [`BertModelForSequenceClassification`]). - -Come abbiamo menzionato in precedenza, scriveremo soltanto un wrapper del modello, per mantenerlo semplice ai fini di -questo esempio. L'unica cosa che dobbiamo fare prima di scrivere questa classe è una mappatura fra i tipi di blocco e -le vere classi dei blocchi. Successivamente il modello è definito tramite la configurazione, passando tutto quanto alla -classe `ResNet`. 
- -```py -from transformers import PreTrainedModel -from timm.models.resnet import BasicBlock, Bottleneck, ResNet -from .configuration_resnet import ResnetConfig - - -BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck} - - -class ResnetModel(PreTrainedModel): - config_class = ResnetConfig - - def __init__(self, config): - super().__init__(config) - block_layer = BLOCK_MAPPING[config.block_type] - self.model = ResNet( - block_layer, - config.layers, - num_classes=config.num_classes, - in_chans=config.input_channels, - cardinality=config.cardinality, - base_width=config.base_width, - stem_width=config.stem_width, - stem_type=config.stem_type, - avg_down=config.avg_down, - ) - - def forward(self, tensor): - return self.model.forward_features(tensor) -``` - -Per il modello che classificherà le immagini, cambiamo soltanto il metodo forward: - -```py -import torch - - -class ResnetModelForImageClassification(PreTrainedModel): - config_class = ResnetConfig - - def __init__(self, config): - super().__init__(config) - block_layer = BLOCK_MAPPING[config.block_type] - self.model = ResNet( - block_layer, - config.layers, - num_classes=config.num_classes, - in_chans=config.input_channels, - cardinality=config.cardinality, - base_width=config.base_width, - stem_width=config.stem_width, - stem_type=config.stem_type, - avg_down=config.avg_down, - ) - - def forward(self, tensor, labels=None): - logits = self.model(tensor) - if labels is not None: - loss = torch.nn.cross_entropy(logits, labels) - return {"loss": loss, "logits": logits} - return {"logits": logits} -``` - -Nota come, in entrambi i casi, ereditiamo da `PreTrainedModel` e chiamiamo l'inizializzazione della superclasse -con il metodo `config` (un po' come quando scrivi un normale `torch.nn.Module`). La riga che imposta la `config_class` -non è obbligatoria, a meno che tu non voglia registrare il modello con le classi Auto (vedi l'ultima sezione). - - - -Se il tuo modello è molto simile a un modello all'interno della libreria, puoi ri-usare la stessa configurazione di quel modello. - - - -Puoi fare in modo che il tuo modello restituisca in output qualunque cosa tu voglia, ma far restituire un dizionario -come abbiamo fatto per `ResnetModelForImageClassification`, con la funzione di perdita inclusa quando vengono passate le labels, -renderà il tuo modello direttamente utilizzabile all'interno della classe [`Trainer`]. Utilizzare altri formati di output va bene -se hai in progetto di utilizzare un tuo loop di allenamento, o se utilizzerai un'altra libreria per l'addestramento. - -Ora che abbiamo la classe del nostro modello, creiamone uno: - -```py -resnet50d = ResnetModelForImageClassification(resnet50d_config) -``` - -Ribadiamo, puoi usare qualunque metodo dei [`PreTrainedModel`], come [`~PreTrainedModel.save_pretrained`] o -[`~PreTrainedModel.push_to_hub`]. Utilizzeremo quest'ultimo nella prossima sezione, e vedremo come caricare i pesi del -modello assieme al codice del modello stesso. Ma prima, carichiamo alcuni pesi pre-allenati all'interno del nostro modello. - -Nel tuo caso specifico, probabilmente allenerai il tuo modello sui tuoi dati. Per velocizzare in questo tutorial, -utilizzeremo la versione pre-allenata del resnet50d. 
Dato che il nostro modello è soltanto un wrapper attorno a quel modello, -sarà facile trasferirne i pesi: - -```py -import timm - -pretrained_model = timm.create_model("resnet50d", pretrained=True) -resnet50d.model.load_state_dict(pretrained_model.state_dict()) -``` - -Vediamo adesso come assicurarci che quando facciamo [`~PreTrainedModel.save_pretrained`] o [`~PreTrainedModel.push_to_hub`], -il codice del modello venga salvato. - -## Inviare il codice all'Hub - - - -Questa API è sperimentale e potrebbe avere alcuni cambiamenti nei prossimi rilasci. - - - -Innanzitutto, assicurati che il tuo modello sia completamente definito in un file `.py`. Può sfruttare import relativi -ad altri file, purchè questi siano nella stessa directory (non supportiamo ancora sotto-moduli per questa funzionalità). -Per questo esempio, definiremo un file `modeling_resnet.py` e un file `configuration_resnet.py` in una cartella dell'attuale -working directory chiamata `resnet_model`. Il file configuration contiene il codice per `ResnetConfig` e il file modeling -contiene il codice di `ResnetModel` e `ResnetModelForImageClassification`. - -``` -. -└── resnet_model - ├── __init__.py - ├── configuration_resnet.py - └── modeling_resnet.py -``` - -Il file `__init__.py` può essere vuoto, serve solo perchè Python capisca che `resnet_model` può essere utilizzato come un modulo. - - - -Se stai copiando i file relativi alla modellazione della libreria, dovrai sostituire tutti gli import relativi in cima al file con import del - pacchetto `transformers`. - - - -Nota che puoi ri-utilizzare (o usare come sottoclassi) un modello/configurazione esistente. - -Per condividere il tuo modello con la community, segui questi passi: prima importa il modello ResNet e la sua configurazione -dai nuovi file creati: - -```py -from resnet_model.configuration_resnet import ResnetConfig -from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification -``` - -Dopodichè dovrai dire alla libreria che vuoi copiare i file con il codice di quegli oggetti quando utilizzi il metodo -`save_pretrained` e registrarli in modo corretto con una Auto classe (specialmente per i modelli). Utilizza semplicemente: - -```py -ResnetConfig.register_for_auto_class() -ResnetModel.register_for_auto_class("AutoModel") -ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification") -``` - -Nota che non c'è bisogno di specificare una Auto classe per la configurazione (c'è solo una Auto classe per le configurazioni, -[`AutoConfig`], ma è diversa per i modelli). Il tuo modello personalizato potrebbe essere utilizzato per diverse tasks, -per cui devi specificare quale delle classi Auto è quella corretta per il tuo modello. - -Successivamente, creiamo i modelli e la config come abbiamo fatto in precedenza: - -```py -resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) -resnet50d = ResnetModelForImageClassification(resnet50d_config) - -pretrained_model = timm.create_model("resnet50d", pretrained=True) -resnet50d.model.load_state_dict(pretrained_model.state_dict()) -``` - -Adesso, per inviare il modello all'Hub, assicurati di aver effettuato l'accesso. 
Lancia dal tuo terminale: - -```bash -huggingface-cli login -``` - -O da un notebook: - -```py -from huggingface_hub import notebook_login - -notebook_login() -``` - -Potrai poi inviare il tutto sul tuo profilo (o di un'organizzazione di cui fai parte) in questo modo: - -```py -resnet50d.push_to_hub("custom-resnet50d") -``` - -Oltre ai pesi del modello e alla configurazione in formato json, questo ha anche copiato i file `.py` modeling e -configuration all'interno della cartella `custom-resnet50d` e ha caricato i risultati sull'Hub. Puoi controllare -i risultati in questa [model repo](https://huggingface.co/sgugger/custom-resnet50d). - -Puoi controllare il tutorial di condivisione [tutorial di condivisione](model_sharing) per più informazioni sul -metodo con cui inviare all'Hub. - -## Usare un modello con codice personalizzato - -Puoi usare ogni configurazione, modello o tokenizer con file di codice personalizzati nella sua repository -con le classi Auto e il metodo `from_pretrained`. Tutti i files e il codice caricati sull'Hub sono scansionati da malware -(fai riferimento alla documentazione [Hub security](https://huggingface.co/docs/hub/security#malware-scanning) per più informazioni), -ma dovresti comunque assicurarti dell'affidabilità del codice e dell'autore per evitare di eseguire codice dannoso sulla tua macchina. -Imposta `trust_remote_code=True` per usare un modello con codice personalizzato: - -```py -from transformers import AutoModelForImageClassification - -model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True) -``` - -Inoltre, raccomandiamo fortemente di passare un hash del commit come `revision` per assicurarti che le autrici o gli autori del modello -non abbiano modificato il codice con alcune nuove righe dannose (a meno che non ti fidi completamente della fonte): - -```py -commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292" -model = AutoModelForImageClassification.from_pretrained( - "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash -) -``` - -Nota che quando cerchi la storia dei commit della repo del modello sull'Hub, c'è un bottone con cui facilmente copiare il -commit hash di ciascun commit. - -## Registrare un modello con codice personalizzato nelle classi Auto - -Se stai scrivendo una libreria che estende 🤗 Transformers, potresti voler estendere le classi Auto per includere il tuo modello. -Questo è diverso dall'inviare codice nell'Hub: gli utenti dovranno importare la tua libreria per ottenere il modello personalizzato -(anzichè scaricare automaticamente il modello dall'Hub). - -Finchè il tuo file di configurazione ha un attributo `model_type` diverso dai model types esistenti, e finchè le tue -classi modello hanno i corretti attributi `config_class`, potrai semplicemente aggiungerli alle classi Auto come segue: - -```py -from transformers import AutoConfig, AutoModel, AutoModelForImageClassification - -AutoConfig.register("resnet", ResnetConfig) -AutoModel.register(ResnetConfig, ResnetModel) -AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification) -``` - -Nota che il primo argomento utilizzato quando registri la configurazione di un modello personalizzato con [`AutoConfig`] -deve corrispondere al `model_type` della tua configurazione personalizzata, ed il primo argomento utilizzato quando -registri i tuoi modelli personalizzati in una qualunque classe Auto del modello deve corrispondere alla `config_class` -di quei modelli. 
diff --git a/docs/source/it/debugging.md b/docs/source/it/debugging.md new file mode 100644 index 000000000000..5c1dab51bd11 --- /dev/null +++ b/docs/source/it/debugging.md @@ -0,0 +1,318 @@ + + +# Debugging + +## Debug dei problemi di rete multi-GPU + +Quando addestri o fai inferenza con `DistributedDataParallel` e GPU multiple, se si verificano problemi di intercomunicazione tra processi e/o nodi, puoi utilizzare il seguente script per diagnosticare i problemi della rete. + +```bash +wget https://raw.githubusercontent.com/huggingface/transformers/main/scripts/distributed/torch-distributed-gpu-test.py +``` + +Per esempio per testare come 2 GPU interagiscono fai: + +```bash +python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py +``` + +Se entrambi i processi sono in grado di comunicare tra loro e di allocare la memoria della GPU, ciascuno di essi stamperà lo stato OK. + +Per più GPU o nodi adatta gli argumenti nello script. + +All'interno dello script di diagnostica troverai molti altri dettagli e anche una guida per eseguirlo in ambiente SLURM. + +Un livello di debug superiore è aggiungere la variabile d'ambiente `NCCL_DEBUG=INFO` come di seguito: + +```bash +NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py +``` + +In questo modo si scaricano molte informazioni di debug relative a NCCL, che puoi cercare online in caso di problemi. Oppure, se non hai la sicurezza di come interpretare l'output, puoi condividere il file di log in una Issue. + +## Rilevamento di Underflow e Overflow + + + +Questa funzionalità al momento è disponibile solo per PyTorch. + + + + + +Per addestramento multi-GPU richiede DDP (`torch.distributed.launch`). + + + + + +Questa funzionalità può essere usata con modelli basati su `nn.Module`. + + + +Se inizi a ottenere `loss=NaN` o il modello presenta qualche altro comportamento anomalo a causa di valori `inf` o `nan` in +attivazioni o nei pesi, è necessario scoprire dove si verifica il primo underflow o overflow e cosa lo ha determinato. Fortunatamente +è possibile farlo facilmente attivando un modulo speciale che effettuerà il rilevamento automaticamente. + +Se stai usando [`Trainer`], hai bisogno di aggiungere solo: + +```bash +--debug underflow_overflow +``` + +ai normali argomenti della riga di comando, o passa `debug="underflow_overflow"` quando viene creato l'oggetto +[`TrainingArguments`]. + +Se stai usando il tuo ciclo di allenamento o un altro trainer, puoi ottenere lo stesso risultato con: + +```python +from .debug_utils import DebugUnderflowOverflow + +debug_overflow = DebugUnderflowOverflow(model) +``` + +[`~debug_utils.DebugUnderflowOverflow`] inserisce dei ganci nel modello che dopo ogni chiamata +testeranno le variabili di ingresso e di uscita e anche i pesi del modulo corrispondente. Non appena viene rilevato `inf` o +o `nan` in almeno un elemento delle attivazioni o dei pesi, il programma lo notifica e stampa un rapporto come il seguente (questo è stato rilevato con `google/mt5-small` sotto fp16 mixed precision): + +``` +Detected inf/nan during batch_number=0 +Last 21 forward frames: +abs min abs max metadata + encoder.block.1.layer.1.DenseReluDense.dropout Dropout +0.00e+00 2.57e+02 input[0] +0.00e+00 2.85e+02 output +[...] 
+ encoder.block.2.layer.0 T5LayerSelfAttention +6.78e-04 3.15e+03 input[0] +2.65e-04 3.42e+03 output[0] + None output[1] +2.25e-01 1.00e+04 output[2] + encoder.block.2.layer.1.layer_norm T5LayerNorm +8.69e-02 4.18e-01 weight +2.65e-04 3.42e+03 input[0] +1.79e-06 4.65e+00 output + encoder.block.2.layer.1.DenseReluDense.wi_0 Linear +2.17e-07 4.50e+00 weight +1.79e-06 4.65e+00 input[0] +2.68e-06 3.70e+01 output + encoder.block.2.layer.1.DenseReluDense.wi_1 Linear +8.08e-07 2.66e+01 weight +1.79e-06 4.65e+00 input[0] +1.27e-04 2.37e+02 output + encoder.block.2.layer.1.DenseReluDense.dropout Dropout +0.00e+00 8.76e+03 input[0] +0.00e+00 9.74e+03 output + encoder.block.2.layer.1.DenseReluDense.wo Linear +1.01e-06 6.44e+00 weight +0.00e+00 9.74e+03 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense +1.79e-06 4.65e+00 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.dropout Dropout +3.18e-04 6.27e+04 input[0] +0.00e+00 inf output +``` + +L'output di esempio è stato tagliato al centro per brevità. + +La seconda colonna mostra il valore dell'elemento più grande in assoluto,così se osserviamo da vicino gli ultimi istanti, +input e output sono nel range di `1e4`. Questo addestramento è stato eseguito con una mixed precision fp16 e l'ultimo passo usciva fuori (sotto `fp16` il valore più grande prima di `inf` è `64e3`). Per evitare overflows sotto `fp16` le attivazionioni devono rimanere molto al di sotto di `1e4`, perché `1e4 * 1e4 = 1e8` quindi qualsiasi moltiplicazione di matrice con grandi attivazioni porterà a una condizione di overflow numerico. + +All'inizio della traccia è possibile scoprire a quale lotto si è verificato il problema (questo `Detected inf/nan during batch_number=0` significa che il problema si è verificato nel primo lotto). + +Ogni frame segnalato inizia dichiarando la voce completamente qualificata per il modulo corrispondente per il quale il frame è stato segnalato. +Se osserviamo il seguente frame: + +``` + encoder.block.2.layer.1.layer_norm T5LayerNorm +8.69e-02 4.18e-01 weight +2.65e-04 3.42e+03 input[0] +1.79e-06 4.65e+00 output +``` + +Questo, `encoder.block.2.layer.1.layer_norm` indica che si tratta di un layer norm nel primo layer, del secondo blocco dell'encoder. E le chiamata specifica di `forward` è `T5LayerNorm`. + +Osserviamo gli ultimi frame del report: + +``` +Detected inf/nan during batch_number=0 +Last 21 forward frames: +abs min abs max metadata +[...] + encoder.block.2.layer.1.DenseReluDense.wi_0 Linear +2.17e-07 4.50e+00 weight +1.79e-06 4.65e+00 input[0] +2.68e-06 3.70e+01 output + encoder.block.2.layer.1.DenseReluDense.wi_1 Linear +8.08e-07 2.66e+01 weight +1.79e-06 4.65e+00 input[0] +1.27e-04 2.37e+02 output + encoder.block.2.layer.1.DenseReluDense.wo Linear +1.01e-06 6.44e+00 weight +0.00e+00 9.74e+03 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense +1.79e-06 4.65e+00 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.dropout Dropout +3.18e-04 6.27e+04 input[0] +0.00e+00 inf output +``` + +L'ultimo frame report per la funzione `Dropout.forward` con la prima voce per l'unico input e la seconda per l'unico output. Si può notare che è stato richiamato da un attibuto `dropout` dentro la classe `DenseReluDense`. Si può notare che ciò è avvenuto durante il primo strato, del 2° blocco, durante il primissimo lotto. Infine, gli elementi di input più grandi in assoluto sono stati `6.27e+04` e l'equivalente per l'output era `inf`. 
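+
+Per fissare le idee, ecco uno schizzo minimale (i valori numerici sono un'assunzione presa dal report qui sopra) di come un'attivazione intorno a `6.27e+04` vada in overflow in `fp16` non appena viene riscalata di `1/(1-p)`, come fa `Dropout` con gli elementi non azzerati:
+
+```python
+import torch
+
+# Il valore finito più grande rappresentabile in fp16 è 65504: un'attivazione di ~6.27e4
+# riscalata di 1/(1-0.1) ≈ 1.11 lo supera e diventa inf.
+x = torch.tensor([6.27e4], dtype=torch.float16)
+print(torch.finfo(torch.float16).max)  # 65504.0
+print(x * (1 / (1 - 0.1)))  # tensor([inf], dtype=torch.float16)
+```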
+ +Puoi vedere qui, che `T5DenseGatedGeluDense.forward` risulta in output activations, il cui valore massimo assoluto era circa 62,7K, che è molto vicino al limite massimo di 64K di fp16. Nel prossimo frame abbiamo `Dropout` che rinormalizza i pesi, dopo aver azzerato alcuni elementi, il che spinge il valore massimo assoluto a più di 64K e si verifica un overflow.(`inf`). + +Come puoi notare, è nei frames precedenti che occorre esaminare quando i numeri iniziano a diventare molto grandi per i valori fp16. + +Confrontiamo il report al codice `models/t5/modeling_t5.py`: + +```python +class T5DenseGatedGeluDense(nn.Module): + def __init__(self, config): + super().__init__() + self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout_rate) + self.gelu_act = ACT2FN["gelu_new"] + + def forward(self, hidden_states): + hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states +``` + +Ora è facile vedere la chiamata `dropout`, e tutte le chiamate precedenti. + +Poiché il rilevamento avviene in un avanzamento (forward hook in eng.), i rapporti vengono creati immeditamente dopo ogni rientro da `forward` (forward returns in eng.). + +Tornando al rapporto completo, per agire e risolvere il problema, dobbiamo andare qualche frame più in alto, dove i numeri hanno iniziato a salire, e probabilmente passare alla modalità `fp32`, in modo che i numeri non trabocchino quando vengono moltiplicati o sommati. Naturalmente, potrebbero esserci altre soluzioni. Per esempio, potremmo spegnere temporanemante `amp` se è abilitato, successivamente spostare `forward` in un helper wrapper, come: + +```python +def _forward(self, hidden_states): + hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states + + +import torch + + +def forward(self, hidden_states): + if torch.is_autocast_enabled(): + with torch.cuda.amp.autocast(enabled=False): + return self._forward(hidden_states) + else: + return self._forward(hidden_states) +``` + +Poiché il rilevatore automatico riporta solo gli ingressi e le uscite di fotogrammi completi, una volta che si sa dove cercare, si può +analizzare anche le fasi intermedie di una specifica funzione `forward`. In alcuni casi puoi usare la funzione di supporto `detect_overflow` per indirizzare il rilevatore dove preferisci, ad esempio: + +```python +from debug_utils import detect_overflow + + +class T5LayerFF(nn.Module): + [...] + + def forward(self, hidden_states): + forwarded_states = self.layer_norm(hidden_states) + detect_overflow(forwarded_states, "after layer_norm") + forwarded_states = self.DenseReluDense(forwarded_states) + detect_overflow(forwarded_states, "after DenseReluDense") + return hidden_states + self.dropout(forwarded_states) +``` + +Si può vedere che abbiamo aggiunto 2 di questi e ora teniamo traccia se `inf` o `nan` per `forwarded_states` è stato rilevato +da qualche parte. 
+ +In realtà, il rilevatore li riporta già, perché ciascuna delle chiamate nell'esempio precedente è un `nn.Module`, ma +diciamo che se avessimo dei calcoli diretti locali, questo è il modo in cui lo faremmo. + +Inoltre, se si istanzia il debugger nel proprio codice, è possibile modificare il numero di fotogrammi stampati rispetto a +predefinito, ad esempio.: + +```python +from .debug_utils import DebugUnderflowOverflow + +debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100) +``` + +### Tracciamento della mistura assoluta del lotto specifico e del valore massimo + +La stessa classe di debug può essere utilizzata per il tracciamento per-batch con la funzione di rilevamento di underflow/overflow disattivata. + +Supponiamo di voler osservare i valori minimi e massimi assoluti per tutti gli ingredienti di ogni chiamata `forward` di un dato lotto. +lotto, e che lo si voglia fare solo per i lotti 1 e 3. Si istanzia questa classe come: + +```python +debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3]) +``` + +Ora i batch completi 1 e 3 saranno tracciati utilizzando lo stesso formato del rilevatore di underflow/overflow. + +I batches sono 0-indexed. + +Questo è utile se si sa che il programma inizia a comportarsi male dopo un certo numero di batch, in modo da poter avanzare velocemente fino a quell'area. +direttamente a quell'area. Ecco un esempio di output troncato per questa configurazione: + +``` + *** Starting batch number=1 *** +abs min abs max metadata + shared Embedding +1.01e-06 7.92e+02 weight +0.00e+00 2.47e+04 input[0] +5.36e-05 7.92e+02 output +[...] + decoder.dropout Dropout +1.60e-07 2.27e+01 input[0] +0.00e+00 2.52e+01 output + decoder T5Stack + not a tensor output + lm_head Linear +1.01e-06 7.92e+02 weight +0.00e+00 1.11e+00 input[0] +6.06e-02 8.39e+01 output + T5ForConditionalGeneration + not a tensor output + + *** Starting batch number=3 *** +abs min abs max metadata + shared Embedding +1.01e-06 7.92e+02 weight +0.00e+00 2.78e+04 input[0] +5.36e-05 7.92e+02 output +[...] +``` + +Qui verrà scaricato un numero enorme di fotogrammi, tanti quanti sono le chiamate in avanti nel modello, quindi può essere o non essere quello che volete, ma a volte può essere più utile usarlo di un classico debugger. Per esempio, se il problema inizia a verificarsi a partire dal lotto numero 150. Quindi è possibile scaricare le tracce dei lotti 149 e 150 e confrontare i punti in cui i numeri hanno iniziato a divergere. + +È inoltre possibile specificare il numero di batch dopo il quale interrompere l'addestramento, con: + +```python +debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3) +``` diff --git a/docs/source/it/debugging.mdx b/docs/source/it/debugging.mdx deleted file mode 100644 index 5b392489eab9..000000000000 --- a/docs/source/it/debugging.mdx +++ /dev/null @@ -1,314 +0,0 @@ - - -# Debugging - -## Debug dei problemi di rete multi-GPU - -Quando addestri o fai inferenza con `DistributedDataParallel` e GPU multiple, se si verificano problemi di intercomunicazione tra processi e/o nodi, puoi utilizzare il seguente script per diagnosticare i problemi della rete. 
- -```bash -wget https://raw.githubusercontent.com/huggingface/transformers/main/scripts/distributed/torch-distributed-gpu-test.py -``` - -Per esempio per testare come 2 GPU interagiscono fai: - -```bash -python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py -``` - -Se entrambi i processi sono in grado di comunicare tra loro e di allocare la memoria della GPU, ciascuno di essi stamperà lo stato OK. - -Per più GPU o nodi adatta gli argumenti nello script. - -All'interno dello script di diagnostica troverai molti altri dettagli e anche una guida per eseguirlo in ambiente SLURM. - -Un livello di debug superiore è aggiungere la variabile d'ambiente `NCCL_DEBUG=INFO` come di seguito: - -```bash -NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py -``` - -In questo modo si scaricano molte informazioni di debug relative a NCCL, che puoi cercare online in caso di problemi. Oppure, se non hai la sicurezza di come interpretare l'output, puoi condividere il file di log in una Issue. - -## Rilevamento di Underflow e Overflow - - - -Questa funzionalità al momento è disponibile solo per PyTorch. - - - - - -Per addestramento multi-GPU richiede DDP (`torch.distributed.launch`). - - - - - -Questa funzionalità può essere usata con modelli basati su `nn.Module`. - - - -Se inizi a ottenere `loss=NaN` o il modello presenta qualche altro comportamento anomalo a causa di valori `inf` o `nan` in -attivazioni o nei pesi, è necessario scoprire dove si verifica il primo underflow o overflow e cosa lo ha determinato. Fortunatamente -è possibile farlo facilmente attivando un modulo speciale che effettuerà il rilevamento automaticamente. - -Se stai usando [`Trainer`], hai bisogno di aggiungere solo: - -```bash ---debug underflow_overflow -``` - -ai normali argomenti della riga di comando, o passa `debug="underflow_overflow"` quando viene creato l'oggetto -[`TrainingArguments`]. - -Se stai usando il tuo ciclo di allenamento o un altro trainer, puoi ottenere lo stesso risultato con: - -```python -from .debug_utils import DebugUnderflowOverflow - -debug_overflow = DebugUnderflowOverflow(model) -``` - -[`~debug_utils.DebugUnderflowOverflow`] inserisce dei ganci nel modello che dopo ogni chiamata -testeranno le variabili di ingresso e di uscita e anche i pesi del modulo corrispondente. Non appena viene rilevato `inf` o -o `nan` in almeno un elemento delle attivazioni o dei pesi, il programma lo notifica e stampa un rapporto come il seguente (questo è stato rilevato con `google/mt5-small` sotto fp16 mixed precision): - -``` -Detected inf/nan during batch_number=0 -Last 21 forward frames: -abs min abs max metadata - encoder.block.1.layer.1.DenseReluDense.dropout Dropout -0.00e+00 2.57e+02 input[0] -0.00e+00 2.85e+02 output -[...] 
- encoder.block.2.layer.0 T5LayerSelfAttention -6.78e-04 3.15e+03 input[0] -2.65e-04 3.42e+03 output[0] - None output[1] -2.25e-01 1.00e+04 output[2] - encoder.block.2.layer.1.layer_norm T5LayerNorm -8.69e-02 4.18e-01 weight -2.65e-04 3.42e+03 input[0] -1.79e-06 4.65e+00 output - encoder.block.2.layer.1.DenseReluDense.wi_0 Linear -2.17e-07 4.50e+00 weight -1.79e-06 4.65e+00 input[0] -2.68e-06 3.70e+01 output - encoder.block.2.layer.1.DenseReluDense.wi_1 Linear -8.08e-07 2.66e+01 weight -1.79e-06 4.65e+00 input[0] -1.27e-04 2.37e+02 output - encoder.block.2.layer.1.DenseReluDense.dropout Dropout -0.00e+00 8.76e+03 input[0] -0.00e+00 9.74e+03 output - encoder.block.2.layer.1.DenseReluDense.wo Linear -1.01e-06 6.44e+00 weight -0.00e+00 9.74e+03 input[0] -3.18e-04 6.27e+04 output - encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense -1.79e-06 4.65e+00 input[0] -3.18e-04 6.27e+04 output - encoder.block.2.layer.1.dropout Dropout -3.18e-04 6.27e+04 input[0] -0.00e+00 inf output -``` - -L'output di esempio è stato tagliato al centro per brevità. - -La seconda colonna mostra il valore dell'elemento più grande in assoluto,così se osserviamo da vicino gli ultimi istanti, -input e output sono nel range di `1e4`. Questo addestramento è stato eseguito con una mixed precision fp16 e l'ultimo passo usciva fuori (sotto `fp16` il valore più grande prima di `inf` è `64e3`). Per evitare overflows sotto `fp16` le attivazionioni devono rimanere molto al di sotto di `1e4`, perché `1e4 * 1e4 = 1e8` quindi qualsiasi moltiplicazione di matrice con grandi attivazioni porterà a una condizione di overflow numerico. - -All'inizio della traccia è possibile scoprire a quale lotto si è verificato il problema (questo `Detected inf/nan during batch_number=0` significa che il problema si è verificato nel primo lotto). - -Ogni frame segnalato inizia dichiarando la voce completamente qualificata per il modulo corrispondente per il quale il frame è stato segnalato. -Se osserviamo il seguente frame: - -``` - encoder.block.2.layer.1.layer_norm T5LayerNorm -8.69e-02 4.18e-01 weight -2.65e-04 3.42e+03 input[0] -1.79e-06 4.65e+00 output -``` - -Questo, `encoder.block.2.layer.1.layer_norm` indica che si tratta di un layer norm nel primo layer, del secondo blocco dell'encoder. E le chiamata specifica di `forward` è `T5LayerNorm`. - -Osserviamo gli ultimi frame del report: - -``` -Detected inf/nan during batch_number=0 -Last 21 forward frames: -abs min abs max metadata -[...] - encoder.block.2.layer.1.DenseReluDense.wi_0 Linear -2.17e-07 4.50e+00 weight -1.79e-06 4.65e+00 input[0] -2.68e-06 3.70e+01 output - encoder.block.2.layer.1.DenseReluDense.wi_1 Linear -8.08e-07 2.66e+01 weight -1.79e-06 4.65e+00 input[0] -1.27e-04 2.37e+02 output - encoder.block.2.layer.1.DenseReluDense.wo Linear -1.01e-06 6.44e+00 weight -0.00e+00 9.74e+03 input[0] -3.18e-04 6.27e+04 output - encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense -1.79e-06 4.65e+00 input[0] -3.18e-04 6.27e+04 output - encoder.block.2.layer.1.dropout Dropout -3.18e-04 6.27e+04 input[0] -0.00e+00 inf output -``` - -L'ultimo frame report per la funzione `Dropout.forward` con la prima voce per l'unico input e la seconda per l'unico output. Si può notare che è stato richiamato da un attibuto `dropout` dentro la classe `DenseReluDense`. Si può notare che ciò è avvenuto durante il primo strato, del 2° blocco, durante il primissimo lotto. Infine, gli elementi di input più grandi in assoluto sono stati `6.27e+04` e l'equivalente per l'output era `inf`. 
- -Puoi vedere qui, che `T5DenseGatedGeluDense.forward` risulta in output activations, il cui valore massimo assoluto era circa 62,7K, che è molto vicino al limite massimo di 64K di fp16. Nel prossimo frame abbiamo `Dropout` che rinormalizza i pesi, dopo aver azzerato alcuni elementi, il che spinge il valore massimo assoluto a più di 64K e si verifica un overflow.(`inf`). - -Come puoi notare, è nei frames precedenti che occorre esaminare quando i numeri iniziano a diventare molto grandi per i valori fp16. - -Confrontiamo il report al codice `models/t5/modeling_t5.py`: - -```python -class T5DenseGatedGeluDense(nn.Module): - def __init__(self, config): - super().__init__() - self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False) - self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False) - self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) - self.dropout = nn.Dropout(config.dropout_rate) - self.gelu_act = ACT2FN["gelu_new"] - - def forward(self, hidden_states): - hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) - hidden_linear = self.wi_1(hidden_states) - hidden_states = hidden_gelu * hidden_linear - hidden_states = self.dropout(hidden_states) - hidden_states = self.wo(hidden_states) - return hidden_states -``` - -Ora è facile vedere la chiamata `dropout`, e tutte le chiamate precedenti. - -Poiché il rilevamento avviene in un avanzamento (forward hook in eng.), i rapporti vengono creati immeditamente dopo ogni rientro da `forward` (forward returns in eng.). - -Tornando al rapporto completo, per agire e risolvere il problema, dobbiamo andare qualche frame più in alto, dove i numeri hanno iniziato a salire, e probabilmente passare alla modalità `fp32`, in modo che i numeri non trabocchino quando vengono moltiplicati o sommati. Naturalmente, potrebbero esserci altre soluzioni. Per esempio, potremmo spegnere temporanemante `amp` se è abilitato, successivamente spostare `forward` in un helper wrapper, come: - -```python -def _forward(self, hidden_states): - hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) - hidden_linear = self.wi_1(hidden_states) - hidden_states = hidden_gelu * hidden_linear - hidden_states = self.dropout(hidden_states) - hidden_states = self.wo(hidden_states) - return hidden_states - - -import torch - - -def forward(self, hidden_states): - if torch.is_autocast_enabled(): - with torch.cuda.amp.autocast(enabled=False): - return self._forward(hidden_states) - else: - return self._forward(hidden_states) -``` - -Poiché il rilevatore automatico riporta solo gli ingressi e le uscite di fotogrammi completi, una volta che si sa dove cercare, si può -analizzare anche le fasi intermedie di una specifica funzione `forward`. In alcuni casi puoi usare la funzione di supporto `detect_overflow` per indirizzare il rilevatore dove preferisci, ad esempio: - -```python -from debug_utils import detect_overflow - - -class T5LayerFF(nn.Module): - [...] - - def forward(self, hidden_states): - forwarded_states = self.layer_norm(hidden_states) - detect_overflow(forwarded_states, "after layer_norm") - forwarded_states = self.DenseReluDense(forwarded_states) - detect_overflow(forwarded_states, "after DenseReluDense") - return hidden_states + self.dropout(forwarded_states) -``` - -Si può vedere che abbiamo aggiunto 2 di questi e ora teniamo traccia se `inf` o `nan` per `forwarded_states` è stato rilevato -da qualche parte. 
- -In realtà, il rilevatore li riporta già, perché ciascuna delle chiamate nell'esempio precedente è un `nn.Module`, ma -diciamo che se avessimo dei calcoli diretti locali, questo è il modo in cui lo faremmo. - -Inoltre, se si istanzia il debugger nel proprio codice, è possibile modificare il numero di fotogrammi stampati rispetto a -predefinito, ad esempio.: - -```python -from .debug_utils import DebugUnderflowOverflow - -debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100) -``` - -### Tracciamento della mistura assoluta del lotto specifico e del valore massimo - -La stessa classe di debug può essere utilizzata per il tracciamento per-batch con la funzione di rilevamento di underflow/overflow disattivata. - -Supponiamo di voler osservare i valori minimi e massimi assoluti per tutti gli ingredienti di ogni chiamata `forward` di un dato lotto. -lotto, e che lo si voglia fare solo per i lotti 1 e 3. Si istanzia questa classe come: - -```python -debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3]) -``` - -Ora i batch completi 1 e 3 saranno tracciati utilizzando lo stesso formato del rilevatore di underflow/overflow. - -I batches sono 0-indexed. - -Questo è utile se si sa che il programma inizia a comportarsi male dopo un certo numero di batch, in modo da poter avanzare velocemente fino a quell'area. -direttamente a quell'area. Ecco un esempio di output troncato per questa configurazione: - -``` - *** Starting batch number=1 *** -abs min abs max metadata - shared Embedding -1.01e-06 7.92e+02 weight -0.00e+00 2.47e+04 input[0] -5.36e-05 7.92e+02 output -[...] - decoder.dropout Dropout -1.60e-07 2.27e+01 input[0] -0.00e+00 2.52e+01 output - decoder T5Stack - not a tensor output - lm_head Linear -1.01e-06 7.92e+02 weight -0.00e+00 1.11e+00 input[0] -6.06e-02 8.39e+01 output - T5ForConditionalGeneration - not a tensor output - - *** Starting batch number=3 *** -abs min abs max metadata - shared Embedding -1.01e-06 7.92e+02 weight -0.00e+00 2.78e+04 input[0] -5.36e-05 7.92e+02 output -[...] -``` - -Qui verrà scaricato un numero enorme di fotogrammi, tanti quanti sono le chiamate in avanti nel modello, quindi può essere o non essere quello che volete, ma a volte può essere più utile usarlo di un classico debugger. Per esempio, se il problema inizia a verificarsi a partire dal lotto numero 150. Quindi è possibile scaricare le tracce dei lotti 149 e 150 e confrontare i punti in cui i numeri hanno iniziato a divergere. - -È inoltre possibile specificare il numero di batch dopo il quale interrompere l'addestramento, con: - -```python -debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3) -``` diff --git a/docs/source/it/index.md b/docs/source/it/index.md new file mode 100644 index 000000000000..5c7d22c1e6b1 --- /dev/null +++ b/docs/source/it/index.md @@ -0,0 +1,300 @@ + + +# 🤗 Transformers + +Machine Learning allo stato dell'arte per PyTorch, TensorFlow e JAX. + +🤗 Transformers fornisce delle API per scaricare in modo semplice e allenare modelli pre-allenati allo stato dell'arte. L'utilizzo di modelli pre-allenati può ridurre i tuoi costi computazionali, l'impatto ambientale, e farti risparmiare il tempo che utilizzeresti per allenare un modello da zero. I modelli possono essere utilizzati in diverse modalità come ad esempio: + +* 📝 Testo: classificazione del testo, estrazione delle informazioni, rispondere a domande, riassumere, traduzione e generazione del testo in più di 100 lingue. 
+* 🖼️ Immagini: classificazione di immagini, rilevazione di oggetti e segmentazione. +* 🗣️ Audio: riconoscimento vocale e classificazione dell'audio. +* 🐙 Multimodale: rispondere a domande inerenti dati tabulari, riconoscimento ottico dei caratteri, estrazione di informazioni a partire da documenti scannerizzati, classificazione di video e risposta visuale a domande. + +La nostra libreria supporta un'integrazione perfetta tra tre delle librerie per il deep learning più popolari: [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/) e [JAX](https://jax.readthedocs.io/en/latest/). Allena il tuo modello in tre righe di codice in un framework, e caricalo per l'inferenza in un altro. + +Ogni architettura di 🤗 Transformers è definita in un modulo Python indipendente così da poter essere personalizzata in modo semplice per la ricerca e gli esperimenti. + +## Se stai cercando supporto personalizzato dal team di Hugging Face + + +HuggingFace Expert Acceleration Program + + +## Contenuti + +La documentazione è organizzata in cinque parti: + +- **INIZIARE** contiene un tour rapido e le istruzioni di installazione per cominciare ad utilizzare 🤗 Transformers. +- **TUTORIALS** è un buon posto da cui iniziare se per te la nostra libreria è nuova. Questa sezione ti aiuterà ad acquisire le competenze basilari di cui hai bisogno per iniziare ad utilizzare 🤗 Transformers. +- **GUIDE PRATICHE** ti mostrerà come raggiungere obiettivi specifici come fare fine-tuning di un modello pre-allenato per la modellizzazione del linguaggio o come creare una testa per un modello personalizzato. +- **GUIDE CONCETTUALI** fornisce discussioni e spiegazioni dei concetti sottostanti alle idee dietro ai modelli, compiti, e la filosofia di progettazione di 🤗 Transformers. +- **API** descrive ogni classe e funzione, raggruppate in: + - **CLASSI PRINCIPALI** per le classi principali che espongono le API importanti della libreria. + - **MODELLI** per le classi e le funzioni relative ad ogni modello implementato all'interno della libreria. + - **HELPERS INTERNI** per le classi e le funzioni che utilizziamo internamente. + +La libreria attualmente contiene implementazioni in JAX, PyTorch e TensorFlow, pesi di modelli pre-allenati, script di utilizzo e strumenti di conversione per i seguenti modelli. + +### Modelli supportati + + + +1. **[ALBERT](model_doc/albert)** (da Google Research e l'Istituto Tecnologico di Chicago) rilasciato con il paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), da Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. +1. **[ALIGN](model_doc/align)** (from Google Research) rilasciato con il paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) da Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig. +1. **[BART](model_doc/bart)** (da Facebook) rilasciato con il paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) da Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov e Luke Zettlemoyer. +1. 
**[BARThez](model_doc/barthez)** (da École polytechnique) rilasciato con il paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) da Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
+1. **[BARTpho](model_doc/bartpho)** (da VinAI Research) rilasciato con il paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) da Nguyen Luong Tran, Duong Minh Le e Dat Quoc Nguyen.
+1. **[BEiT](model_doc/beit)** (da Microsoft) rilasciato con il paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) da Hangbo Bao, Li Dong, Furu Wei.
+1. **[BERT](model_doc/bert)** (da Google) rilasciato con il paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) da Jacob Devlin, Ming-Wei Chang, Kenton Lee e Kristina Toutanova.
+1. **[BERTweet](model_doc/bertweet)** (da VinAI Research) rilasciato con il paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) da Dat Quoc Nguyen, Thanh Vu e Anh Tuan Nguyen.
+1. **[BERT For Sequence Generation](model_doc/bert-generation)** (da Google) rilasciato con il paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) da Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
+1. **[BigBird-RoBERTa](model_doc/big_bird)** (da Google Research) rilasciato con il paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) da Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
+1. **[BigBird-Pegasus](model_doc/bigbird_pegasus)** (da Google Research) rilasciato con il paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) da Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
+1. **[Blenderbot](model_doc/blenderbot)** (da Facebook) rilasciato con il paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) da Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+1. **[BlenderbotSmall](model_doc/blenderbot-small)** (da Facebook) rilasciato con il paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) da Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+1. **[BORT](model_doc/bort)** (da Alexa) rilasciato con il paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) da Adrian de Wynter e Daniel J. Perry.
+1. **[ByT5](model_doc/byt5)** (da Google Research) rilasciato con il paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) da Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
+1. **[CamemBERT](model_doc/camembert)** (da Inria/Facebook/Sorbonne) rilasciato con il paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) da Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah e Benoît Sagot.
+1. 
**[CANINE](model_doc/canine)** (da Google Research) rilasciato con il paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) da Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. +1. **[ConvNeXT](model_doc/convnext)** (da Facebook AI) rilasciato con il paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) da Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. +1. **[ConvNeXTV2](model_doc/convnextv2)** (da Facebook AI) rilasciato con il paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) da Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. +1. **[CLIP](model_doc/clip)** (da OpenAI) rilasciato con il paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) da Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. +1. **[ConvBERT](model_doc/convbert)** (da YituTech) rilasciato con il paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) da Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. +1. **[CPM](model_doc/cpm)** (dalla Università di Tsinghua) rilasciato con il paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) da Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. +1. **[CTRL](model_doc/ctrl)** (da Salesforce) rilasciato con il paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) da Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong e Richard Socher. +1. **[CvT](model_doc/cvt)** (da Microsoft) rilasciato con il paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) da Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. +1. **[Data2Vec](model_doc/data2vec)** (da Facebook) rilasciato con il paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) da Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. +1. **[DeBERTa](model_doc/deberta)** (da Microsoft) rilasciato con il paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) da Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. +1. **[DeBERTa-v2](model_doc/deberta-v2)** (da Microsoft) rilasciato con il paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) da Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. +1. **[Decision Transformer](model_doc/decision_transformer)** (da Berkeley/Facebook/Google) rilasciato con il paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) da Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. +1. 
**[DiT](model_doc/dit)** (da Microsoft Research) rilasciato con il paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) da Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. +1. **[DeiT](model_doc/deit)** (da Facebook) rilasciato con il paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) da Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. +1. **[DETR](model_doc/detr)** (da Facebook) rilasciato con il paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) da Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. +1. **[DialoGPT](model_doc/dialogpt)** (da Microsoft Research) rilasciato con il paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) da Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. +1. **[DistilBERT](model_doc/distilbert)** (da HuggingFace), rilasciato assieme al paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) da Victor Sanh, Lysandre Debut e Thomas Wolf. La stessa tecnica è stata applicata per comprimere GPT2 in [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa in [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT in [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. +1. **[DPR](model_doc/dpr)** (da Facebook) rilasciato con il paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) da Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, e Wen-tau Yih. +1. **[DPT](master/model_doc/dpt)** (da Intel Labs) rilasciato con il paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) da René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. +1. **[EfficientNet](model_doc/efficientnet)** (from Google Research) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan and Quoc V. Le. +1. **[EncoderDecoder](model_doc/encoder-decoder)** (da Google Research) rilasciato con il paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) da Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[ELECTRA](model_doc/electra)** (da Google Research/Stanford University) rilasciato con il paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) da Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. +1. **[FlauBERT](model_doc/flaubert)** (da CNRS) rilasciato con il paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) da Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. +1. 
**[FLAVA](model_doc/flava)** (da Facebook AI) rilasciato con il paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) da Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, e Douwe Kiela. +1. **[FNet](model_doc/fnet)** (da Google Research) rilasciato con il paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) da James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. +1. **[Funnel Transformer](model_doc/funnel)** (da CMU/Google Brain) rilasciato con il paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) da Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. +1. **[GLPN](model_doc/glpn)** (da KAIST) rilasciato con il paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) da Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. +1. **[GPT](model_doc/openai-gpt)** (da OpenAI) rilasciato con il paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) da Alec Radford, Karthik Narasimhan, Tim Salimans e Ilya Sutskever. +1. **[GPT-2](model_doc/gpt2)** (da OpenAI) rilasciato con il paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) da Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** e Ilya Sutskever**. +1. **[GPT-J](model_doc/gptj)** (da EleutherAI) rilasciato nel repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) da Ben Wang e Aran Komatsuzaki. +1. **[GPT Neo](model_doc/gpt_neo)** (da EleutherAI) rilasciato nel repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) da Sid Black, Stella Biderman, Leo Gao, Phil Wang e Connor Leahy. +1. **[GPT NeoX](model_doc/gpt_neox)** (da EleutherAI) rilasciato con il paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) da Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach +1. **[Hubert](model_doc/hubert)** (da Facebook) rilasciato con il paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) da Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. +1. **[I-BERT](model_doc/ibert)** (da Berkeley) rilasciato con il paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) da Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. +1. **[ImageGPT](model_doc/imagegpt)** (da OpenAI) rilasciato con il paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) da Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. +1. **[LayoutLM](model_doc/layoutlm)** (da Microsoft Research Asia) rilasciato con il paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) da Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. +1. 
**[LayoutLMv2](model_doc/layoutlmv2)** (da Microsoft Research Asia) rilasciato con il paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) da Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. +1. **[LayoutLMv3](model_doc/layoutlmv3)** (da Microsoft Research Asia) rilasciato con il paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) da Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei. +1. **[LayoutXLM](model_doc/layoutlxlm)** (da Microsoft Research Asia) rilasciato con il paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) da Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. +1. **[LED](model_doc/led)** (da AllenAI) rilasciato con il paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) da Iz Beltagy, Matthew E. Peters, Arman Cohan. +1. **[Longformer](model_doc/longformer)** (da AllenAI) rilasciato con il paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) da Iz Beltagy, Matthew E. Peters, Arman Cohan. +1. **[LUKE](model_doc/luke)** (da Studio Ousia) rilasciato con il paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) da Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. +1. **[mLUKE](model_doc/mluke)** (da Studio Ousia) rilasciato con il paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) da Ryokan Ri, Ikuya Yamada, e Yoshimasa Tsuruoka. +1. **[LXMERT](model_doc/lxmert)** (da UNC Chapel Hill) rilasciato con il paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) da Hao Tan e Mohit Bansal. +1. **[M2M100](model_doc/m2m_100)** (da Facebook) rilasciato con il paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) da Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. +1. **[MarianMT](model_doc/marian)** Modello di machine learning per le traduzioni allenato utilizzando i dati [OPUS](http://opus.nlpl.eu/) di Jörg Tiedemann. Il [Framework Marian](https://marian-nmt.github.io/) è stato sviluppato dal Microsoft Translator Team. +1. **[Mask2Former](model_doc/mask2former)** (da FAIR e UIUC) rilasciato con il paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) da Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. +1. **[MaskFormer](model_doc/maskformer)** (da Meta e UIUC) rilasciato con il paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) da Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. +1. 
**[MBart](model_doc/mbart)** (da Facebook) rilasciato con il paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) da Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. +1. **[MBart-50](model_doc/mbart)** (da Facebook) rilasciato con il paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) da Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. +1. **[Megatron-BERT](model_doc/megatron-bert)** (da NVIDIA) rilasciato con il paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) da Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper e Bryan Catanzaro. +1. **[Megatron-GPT2](model_doc/megatron_gpt2)** (da NVIDIA) rilasciato con il paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) da Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper e Bryan Catanzaro. +1. **[MPNet](model_doc/mpnet)** (da Microsoft Research) rilasciato con il paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) da Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. +1. **[MT5](model_doc/mt5)** (da Google AI) rilasciato con il paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) da Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. +1. **[Nyströmformer](model_doc/nystromformer)** (dalla Università del Wisconsin - Madison) rilasciato con il paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) da Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. +1. **[OneFormer](model_doc/oneformer)** (da SHI Labs) rilasciato con il paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) da Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. +1. **[OPT](master/model_doc/opt)** (da Meta AI) rilasciato con il paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) da Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. +1. **[Pegasus](model_doc/pegasus)** (da Google) rilasciato con il paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) da Jingqing Zhang, Yao Zhao, Mohammad Saleh e Peter J. Liu. +1. **[Perceiver IO](model_doc/perceiver)** (da Deepmind) rilasciato con il paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) da Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. +1. **[PhoBERT](model_doc/phobert)** (da VinAI Research) rilasciato con il paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) da Dat Quoc Nguyen e Anh Tuan Nguyen. +1. 
**[PLBart](model_doc/plbart)** (da UCLA NLP) rilasciato con il paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) da Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. +1. **[PoolFormer](model_doc/poolformer)** (da Sea AI Labs) rilasciato con il paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) da Yu, Weihao e Luo, Mi e Zhou, Pan e Si, Chenyang e Zhou, Yichen e Wang, Xinchao e Feng, Jiashi e Yan, Shuicheng. +1. **[ProphetNet](model_doc/prophetnet)** (da Microsoft Research) rilasciato con il paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) da Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang e Ming Zhou. +1. **[QDQBert](model_doc/qdqbert)** (da NVIDIA) rilasciato con il paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) da Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev e Paulius Micikevicius. +1. **[REALM](model_doc/realm.html)** (da Google Research) rilasciato con il paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) da Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat e Ming-Wei Chang. +1. **[Reformer](model_doc/reformer)** (da Google Research) rilasciato con il paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) da Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. +1. **[RemBERT](model_doc/rembert)** (da Google Research) rilasciato con il paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) da Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. +1. **[RegNet](model_doc/regnet)** (da META Platforms) rilasciato con il paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) da Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. +1. **[ResNet](model_doc/resnet)** (da Microsoft Research) rilasciato con il paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) da Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. +1. **[RoBERTa](model_doc/roberta)** (da Facebook), rilasciato assieme al paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) da Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. +1. **[RoFormer](model_doc/roformer)** (da ZhuiyiTechnology), rilasciato assieme al paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) da Jianlin Su e Yu Lu e Shengfeng Pan e Bo Wen e Yunfeng Liu. +1. **[SegFormer](model_doc/segformer)** (da NVIDIA) rilasciato con il paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) da Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. +1. **[SEW](model_doc/sew)** (da ASAPP) rilasciato con il paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) da Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. 
**[SEW-D](model_doc/sew_d)** (da ASAPP) rilasciato con il paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) da Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. **[SpeechToTextTransformer](model_doc/speech_to_text)** (da Facebook), rilasciato assieme al paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) da Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. +1. **[SpeechToTextTransformer2](model_doc/speech_to_text_2)** (da Facebook), rilasciato assieme al paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) da Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. +1. **[Splinter](model_doc/splinter)** (dalla Università di Tel Aviv), rilasciato assieme al paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) da Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. +1. **[SqueezeBert](model_doc/squeezebert)** (da Berkeley) rilasciato con il paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) da Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, e Kurt W. Keutzer. +1. **[Swin Transformer](model_doc/swin)** (da Microsoft) rilasciato con il paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) da Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. +1. **[T5](model_doc/t5)** (da Google AI) rilasciato con il paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) da Colin Raffel e Noam Shazeer e Adam Roberts e Katherine Lee e Sharan Narang e Michael Matena e Yanqi Zhou e Wei Li e Peter J. Liu. +1. **[T5v1.1](model_doc/t5v1.1)** (da Google AI) rilasciato nel repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) da Colin Raffel e Noam Shazeer e Adam Roberts e Katherine Lee e Sharan Narang e Michael Matena e Yanqi Zhou e Wei Li e Peter J. Liu. +1. **[TAPAS](model_doc/tapas)** (da Google AI) rilasciato con il paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) da Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno e Julian Martin Eisenschlos. +1. **[TAPEX](model_doc/tapex)** (da Microsoft Research) rilasciato con il paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) da Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[Trajectory Transformer](model_doc/trajectory_transformers)** (dall'Università della California a Berkeley) rilasciato con il paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) da Michael Janner, Qiyang Li, Sergey Levine +1. **[Transformer-XL](model_doc/transfo-xl)** (da Google/CMU) rilasciato con il paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) da Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. +1. 
**[TrOCR](model_doc/trocr)** (da Microsoft), rilasciato assieme al paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) da Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. +1. **[UniSpeech](model_doc/unispeech)** (da Microsoft Research) rilasciato con il paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) da Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. +1. **[UniSpeechSat](model_doc/unispeech-sat)** (da Microsoft Research) rilasciato con il paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) da Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. +1. **[VAN](model_doc/van)** (dalle Università di Tsinghua e Nankai) rilasciato con il paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) da Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. +1. **[ViLT](model_doc/vilt)** (da NAVER AI Lab/Kakao Enterprise/Kakao Brain) rilasciato con il paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) da Wonjae Kim, Bokyung Son, Ildoo Kim. +1. **[Vision Transformer (ViT)](model_doc/vit)** (da Google AI) rilasciato con il paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) da Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[ViTMAE](model_doc/vit_mae)** (da Meta AI) rilasciato con il paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) da Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. +1. **[VisualBERT](model_doc/visual_bert)** (da UCLA NLP) rilasciato con il paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) da Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. +1. **[WavLM](model_doc/wavlm)** (da Microsoft Research) rilasciato con il paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) da Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. +1. **[Wav2Vec2](model_doc/wav2vec2)** (da Facebook AI) rilasciato con il paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) da Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. +1. **[Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)** (da Facebook AI) rilasciato con il paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) da Qiantong Xu, Alexei Baevski, Michael Auli. +1. 
**[XGLM](model_doc/xglm)** (da Facebook AI) rilasciato con il paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) da Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
+1. **[XLM](model_doc/xlm)** (da Facebook) rilasciato assieme al paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) da Guillaume Lample e Alexis Conneau.
+1. **[XLM-ProphetNet](model_doc/xlm-prophetnet)** (da Microsoft Research) rilasciato con il paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) da Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang e Ming Zhou.
+1. **[XLM-RoBERTa](model_doc/xlm-roberta)** (da Facebook AI), rilasciato assieme al paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) da Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer e Veselin Stoyanov.
+1. **[XLM-RoBERTa-XL](model_doc/xlm-roberta-xl)** (da Facebook AI), rilasciato assieme al paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) da Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
+1. **[XLNet](model_doc/xlnet)** (da Google/CMU) rilasciato con il paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) da Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+1. **[XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2)** (da Facebook AI) rilasciato con il paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) da Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
+1. **[XLS-R](model_doc/xls_r)** (da Facebook AI) rilasciato con il paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) da Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
+1. **[YOLOS](model_doc/yolos)** (dalla Università della scienza e tecnologia di Huazhong) rilasciato con il paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) da Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
+1. **[YOSO](model_doc/yoso)** (dall'Università del Wisconsin - Madison) rilasciato con il paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) da Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
+
+
+### Framework supportati
+
+La tabella seguente rappresenta il supporto attuale nella libreria per ognuno di questi modelli: indica se hanno un tokenizer Python
+(chiamato "slow"), un tokenizer "fast" supportato dalla libreria 🤗 Tokenizers, e se hanno supporto in Jax (via Flax), PyTorch e/o TensorFlow.
+ + + +| Model | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support | +|:---------------------------:|:--------------:|:--------------:|:---------------:|:------------------:|:------------:| +| ALBERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| BART | ✅ | ✅ | ✅ | ✅ | ✅ | +| BEiT | ❌ | ❌ | ✅ | ❌ | ✅ | +| BERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ | +| BigBird | ✅ | ✅ | ✅ | ❌ | ✅ | +| BigBirdPegasus | ❌ | ❌ | ✅ | ❌ | ❌ | +| Blenderbot | ✅ | ✅ | ✅ | ✅ | ✅ | +| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ | +| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| Canine | ✅ | ❌ | ✅ | ❌ | ❌ | +| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | +| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| ConvNext | ❌ | ❌ | ✅ | ✅ | ❌ | +| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | +| CvT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecVision | ❌ | ❌ | ✅ | ✅ | ❌ | +| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | +| DeBERTa-v2 | ✅ | ✅ | ✅ | ✅ | ❌ | +| Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| DeiT | ❌ | ❌ | ✅ | ❌ | ❌ | +| DETR | ❌ | ❌ | ✅ | ❌ | ❌ | +| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| DPR | ✅ | ✅ | ✅ | ✅ | ❌ | +| DPT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | +| Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | +| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ | +| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ | +| Flava | ❌ | ❌ | ✅ | ❌ | ❌ | +| FNet | ✅ | ✅ | ✅ | ❌ | ❌ | +| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ | +| GLPN | ❌ | ❌ | ✅ | ❌ | ❌ | +| GPT Neo | ❌ | ❌ | ✅ | ❌ | ✅ | +| GPT NeoX | ❌ | ✅ | ✅ | ❌ | ❌ | +| GPT-J | ❌ | ❌ | ✅ | ✅ | ✅ | +| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ | +| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ | +| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ | +| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ | +| LayoutLMv3 | ✅ | ✅ | ✅ | ✅ | ❌ | +| LED | ✅ | ✅ | ✅ | ✅ | ❌ | +| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ | +| LUKE | ✅ | ❌ | ✅ | ❌ | ❌ | +| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ | +| Marian | ✅ | ❌ | ✅ | ✅ | ✅ | +| MaskFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| mBART | ✅ | ✅ | ✅ | ✅ | ✅ | +| MegatronBert | ❌ | ❌ | ✅ | ❌ | ❌ | +| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ | +| mT5 | ✅ | ✅ | ✅ | ✅ | ✅ | +| Nystromformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ | +| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | +| OPT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ | +| Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ | +| PLBart | ✅ | ❌ | ✅ | ❌ | ❌ | +| PoolFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | +| QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ | +| RAG | ✅ | ❌ | ✅ | ✅ | ❌ | +| Realm | ✅ | ✅ | ✅ | ❌ | ❌ | +| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ | +| RegNet | ❌ | ❌ | ✅ | ✅ | ✅ | +| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| ResNet | ❌ | ❌ | ✅ | ✅ | ✅ | +| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | +| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | +| RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ | +| SegFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| SEW | ❌ | ❌ | ✅ | ❌ | ❌ | +| SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ | +| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ | +| Speech2Text | ✅ | ❌ | ✅ | ✅ | ❌ | +| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ | +| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ | +| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ | +| Swin | ❌ | ❌ | ✅ | ✅ | ❌ | +| T5 | ✅ | ✅ | ✅ | ✅ | ✅ | +| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ | +| Trajectory Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | +| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ | +| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ | +| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ | +| VAN | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViLT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | +| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ | +| VisualBert | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViT | ❌ | ❌ 
| ✅ | ✅ | ✅ | +| ViTMAE | ❌ | ❌ | ✅ | ✅ | ❌ | +| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | +| Wav2Vec2-Conformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| WavLM | ❌ | ❌ | ✅ | ❌ | ❌ | +| XGLM | ✅ | ✅ | ✅ | ❌ | ✅ | +| XLM | ✅ | ❌ | ✅ | ✅ | ❌ | +| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | +| XLM-RoBERTa-XL | ❌ | ❌ | ✅ | ❌ | ❌ | +| XLMProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | +| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ | +| YOLOS | ❌ | ❌ | ✅ | ❌ | ❌ | +| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ | + + diff --git a/docs/source/it/index.mdx b/docs/source/it/index.mdx deleted file mode 100644 index e612c3699b59..000000000000 --- a/docs/source/it/index.mdx +++ /dev/null @@ -1,291 +0,0 @@ - - -# 🤗 Transformers - -Machine Learning allo stato dell'arte per PyTorch, TensorFlow e JAX. - -🤗 Transformers fornisce delle API per scaricare in modo semplice e allenare modelli pre-allenati allo stato dell'arte. L'utilizzo di modelli pre-allenati può ridurre i tuoi costi computazionali, l'impatto ambientale, e farti risparmiare il tempo che utilizzeresti per allenare un modello da zero. I modelli possono essere utilizzati in diverse modalità come ad esempio: - -* 📝 Testo: classificazione del testo, estrazione delle informazioni, rispondere a domande, riassumere, traduzione e generazione del testo in più di 100 lingue. -* 🖼️ Immagini: classificazione di immagini, rilevazione di oggetti e segmentazione. -* 🗣️ Audio: riconoscimento vocale e classificazione dell'audio. -* 🐙 Multimodale: rispondere a domande inerenti dati tabulari, riconoscimento ottico dei caratteri, estrazione di informazioni a partire da documenti scannerizzati, classificazione di video e risposta visuale a domande. - -La nostra libreria supporta un'integrazione perfetta tra tre delle librerie per il deep learning più popolari: [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/) e [JAX](https://jax.readthedocs.io/en/latest/). Allena il tuo modello in tre righe di codice in un framework, e caricalo per l'inferenza in un altro. - -Ogni architettura di 🤗 Transformers è definita in un modulo Python indipendente così da poter essere personalizzata in modo semplice per la ricerca e gli esperimenti. - -## Se stai cercando supporto personalizzato dal team di Hugging Face - - -HuggingFace Expert Acceleration Program - - -## Contenuti - -La documentazione è organizzata in cinque parti: - -- **INIZIARE** contiene un tour rapido e le istruzioni di installazione per cominciare ad utilizzare 🤗 Transformers. -- **TUTORIALS** è un buon posto da cui iniziare se per te la nostra libreria è nuova. Questa sezione ti aiuterà ad acquisire le competenze basilari di cui hai bisogno per iniziare ad utilizzare 🤗 Transformers. -- **GUIDE PRATICHE** ti mostrerà come raggiungere obiettivi specifici come fare fine-tuning di un modello pre-allenato per la modellizzazione del linguaggio o come creare una testa per un modello personalizzato. -- **GUIDE CONCETTUALI** fornisce discussioni e spiegazioni dei concetti sottostanti alle idee dietro ai modelli, compiti, e la filosofia di progettazione di 🤗 Transformers. -- **API** descrive ogni classe e funzione, raggruppate in: - - **CLASSI PRINCIPALI** per le classi principali che espongono le API importanti della libreria. - - **MODELLI** per le classi e le funzioni relative ad ogni modello implementato all'interno della libreria. - - **HELPERS INTERNI** per le classi e le funzioni che utilizziamo internamente. 
- -La libreria attualmente contiene implementazioni in JAX, PyTorch e TensorFlow, pesi di modelli pre-allenati, script di utilizzo e strumenti di conversione per i seguenti modelli. - -### Modelli supportati - - - -1. **[ALBERT](model_doc/albert)** (da Google Research e l'Istituto Tecnologico di Chicago) rilasciato con il paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), da Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[BART](model_doc/bart)** (da Facebook) rilasciato con il paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) da Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov e Luke Zettlemoyer. -1. **[BARThez](model_doc/barthez)** (da politecnico di École) rilasciato con il paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) da Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. -1. **[BARTpho](model_doc/bartpho)** (da VinAI Research) rilasciato con il paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) da Nguyen Luong Tran, Duong Minh Le e Dat Quoc Nguyen. -1. **[BEiT](model_doc/beit)** (da Microsoft) rilasciato con il paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) da Hangbo Bao, Li Dong, Furu Wei. -1. **[BERT](model_doc/bert)** (da Google) rilasciato con il paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) da Jacob Devlin, Ming-Wei Chang, Kenton Lee e Kristina Toutanova. -1. **[BERTweet](model_doc/bertweet)** (da VinAI Research) rilasciato con il paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) da Dat Quoc Nguyen, Thanh Vu e Anh Tuan Nguyen. -1. **[BERT For Sequence Generation](model_doc/bert-generation)** (da Google) rilasciato con il paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) da Sascha Rothe, Shashi Narayan, Aliaksei Severyn. -1. **[BigBird-RoBERTa](model_doc/big_bird)** (da Google Research) rilasciato con il paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) da Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BigBird-Pegasus](model_doc/bigbird_pegasus)** (v Google Research) rilasciato con il paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) da Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[Blenderbot](model_doc/blenderbot)** (da Facebook) rilasciato con il paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) da Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. 
**[BlenderbotSmall](model_doc/blenderbot-small)** (da Facebook) rilasciato con il paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) da Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. **[BORT](model_doc/bort)** (da Alexa) rilasciato con il paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) da Adrian de Wynter e Daniel J. Perry. -1. **[ByT5](model_doc/byt5)** (da Google Research) rilasciato con il paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) da Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. -1. **[CamemBERT](model_doc/camembert)** (da Inria/Facebook/Sorbonne) rilasciato con il paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) da Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah e Benoît Sagot. -1. **[CANINE](model_doc/canine)** (da Google Research) rilasciato con il paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) da Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. -1. **[ConvNeXT](model_doc/convnext)** (da Facebook AI) rilasciato con il paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) da Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. -1. **[CLIP](model_doc/clip)** (da OpenAI) rilasciato con il paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) da Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. -1. **[ConvBERT](model_doc/convbert)** (da YituTech) rilasciato con il paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) da Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. -1. **[CPM](model_doc/cpm)** (dalla Università di Tsinghua) rilasciato con il paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) da Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. -1. **[CTRL](model_doc/ctrl)** (da Salesforce) rilasciato con il paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) da Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong e Richard Socher. -1. **[CvT](model_doc/cvt)** (da Microsoft) rilasciato con il paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) da Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. -1. **[Data2Vec](model_doc/data2vec)** (da Facebook) rilasciato con il paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) da Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. -1. 
**[DeBERTa](model_doc/deberta)** (da Microsoft) rilasciato con il paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) da Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. -1. **[DeBERTa-v2](model_doc/deberta-v2)** (da Microsoft) rilasciato con il paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) da Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. -1. **[Decision Transformer](model_doc/decision_transformer)** (da Berkeley/Facebook/Google) rilasciato con il paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) da Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. -1. **[DiT](model_doc/dit)** (da Microsoft Research) rilasciato con il paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) da Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. -1. **[DeiT](model_doc/deit)** (da Facebook) rilasciato con il paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) da Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. -1. **[DETR](model_doc/detr)** (da Facebook) rilasciato con il paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) da Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. -1. **[DialoGPT](model_doc/dialogpt)** (da Microsoft Research) rilasciato con il paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) da Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. -1. **[DistilBERT](model_doc/distilbert)** (da HuggingFace), rilasciato assieme al paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) da Victor Sanh, Lysandre Debut e Thomas Wolf. La stessa tecnica è stata applicata per comprimere GPT2 in [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa in [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT in [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. -1. **[DPR](model_doc/dpr)** (da Facebook) rilasciato con il paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) da Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, e Wen-tau Yih. -1. **[DPT](master/model_doc/dpt)** (da Intel Labs) rilasciato con il paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) da René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. -1. **[EncoderDecoder](model_doc/encoder-decoder)** (da Google Research) rilasciato con il paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) da Sascha Rothe, Shashi Narayan, Aliaksei Severyn. -1. 
**[ELECTRA](model_doc/electra)** (da Google Research/Stanford University) rilasciato con il paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) da Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. -1. **[FlauBERT](model_doc/flaubert)** (da CNRS) rilasciato con il paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) da Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. -1. **[FLAVA](model_doc/flava)** (da Facebook AI) rilasciato con il paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) da Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, e Douwe Kiela. -1. **[FNet](model_doc/fnet)** (da Google Research) rilasciato con il paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) da James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. -1. **[Funnel Transformer](model_doc/funnel)** (da CMU/Google Brain) rilasciato con il paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) da Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -1. **[GLPN](model_doc/glpn)** (da KAIST) rilasciato con il paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) da Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. -1. **[GPT](model_doc/openai-gpt)** (da OpenAI) rilasciato con il paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) da Alec Radford, Karthik Narasimhan, Tim Salimans e Ilya Sutskever. -1. **[GPT-2](model_doc/gpt2)** (da OpenAI) rilasciato con il paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) da Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** e Ilya Sutskever**. -1. **[GPT-J](model_doc/gptj)** (da EleutherAI) rilasciato nel repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) da Ben Wang e Aran Komatsuzaki. -1. **[GPT Neo](model_doc/gpt_neo)** (da EleutherAI) rilasciato nel repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) da Sid Black, Stella Biderman, Leo Gao, Phil Wang e Connor Leahy. -1. **[GPT NeoX](model_doc/gpt_neox)** (da EleutherAI) rilasciato con il paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) da Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach -1. **[Hubert](model_doc/hubert)** (da Facebook) rilasciato con il paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) da Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. -1. **[I-BERT](model_doc/ibert)** (da Berkeley) rilasciato con il paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) da Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. -1. 
**[ImageGPT](model_doc/imagegpt)** (da OpenAI) rilasciato con il paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) da Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. -1. **[LayoutLM](model_doc/layoutlm)** (da Microsoft Research Asia) rilasciato con il paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) da Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. -1. **[LayoutLMv2](model_doc/layoutlmv2)** (da Microsoft Research Asia) rilasciato con il paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) da Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. -1. **[LayoutLMv3](model_doc/layoutlmv3)** (da Microsoft Research Asia) rilasciato con il paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) da Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei. -1. **[LayoutXLM](model_doc/layoutlxlm)** (da Microsoft Research Asia) rilasciato con il paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) da Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. -1. **[LED](model_doc/led)** (da AllenAI) rilasciato con il paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) da Iz Beltagy, Matthew E. Peters, Arman Cohan. -1. **[Longformer](model_doc/longformer)** (da AllenAI) rilasciato con il paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) da Iz Beltagy, Matthew E. Peters, Arman Cohan. -1. **[LUKE](model_doc/luke)** (da Studio Ousia) rilasciato con il paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) da Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. -1. **[mLUKE](model_doc/mluke)** (da Studio Ousia) rilasciato con il paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) da Ryokan Ri, Ikuya Yamada, e Yoshimasa Tsuruoka. -1. **[LXMERT](model_doc/lxmert)** (da UNC Chapel Hill) rilasciato con il paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) da Hao Tan e Mohit Bansal. -1. **[M2M100](model_doc/m2m_100)** (da Facebook) rilasciato con il paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) da Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. -1. **[MarianMT](model_doc/marian)** Modello di machine learning per le traduzioni allenato utilizzando i dati [OPUS](http://opus.nlpl.eu/) di Jörg Tiedemann. Il [Framework Marian](https://marian-nmt.github.io/) è stato sviluppato dal Microsoft Translator Team. -1. **[MaskFormer](model_doc/maskformer)** (da Meta and UIUC) rilasciato con il paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) da Bowen Cheng, Alexander G. 
Schwing, Alexander Kirillov. -1. **[MBart](model_doc/mbart)** (da Facebook) rilasciato con il paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) da Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. -1. **[MBart-50](model_doc/mbart)** (da Facebook) rilasciato con il paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) da Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. -1. **[Megatron-BERT](model_doc/megatron-bert)** (da NVIDIA) rilasciato con il paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) da Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper e Bryan Catanzaro. -1. **[Megatron-GPT2](model_doc/megatron_gpt2)** (da NVIDIA) rilasciato con il paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) da Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper e Bryan Catanzaro. -1. **[MPNet](model_doc/mpnet)** (da Microsoft Research) rilasciato con il paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) da Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. -1. **[MT5](model_doc/mt5)** (da Google AI) rilasciato con il paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) da Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. -1. **[Nyströmformer](model_doc/nystromformer)** (dalla Università del Wisconsin - Madison) rilasciato con il paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) da Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. -1. **[OPT](master/model_doc/opt)** (da Meta AI) rilasciato con il paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) da Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. -1. **[Pegasus](model_doc/pegasus)** (da Google) rilasciato con il paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) da Jingqing Zhang, Yao Zhao, Mohammad Saleh e Peter J. Liu. -1. **[Perceiver IO](model_doc/perceiver)** (da Deepmind) rilasciato con il paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) da Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. -1. **[PhoBERT](model_doc/phobert)** (da VinAI Research) rilasciato con il paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) da Dat Quoc Nguyen e Anh Tuan Nguyen. -1. **[PLBart](model_doc/plbart)** (da UCLA NLP) rilasciato con il paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) da Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. -1. 
**[PoolFormer](model_doc/poolformer)** (da Sea AI Labs) rilasciato con il paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) da Yu, Weihao e Luo, Mi e Zhou, Pan e Si, Chenyang e Zhou, Yichen e Wang, Xinchao e Feng, Jiashi e Yan, Shuicheng. -1. **[ProphetNet](model_doc/prophetnet)** (da Microsoft Research) rilasciato con il paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) da Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang e Ming Zhou. -1. **[QDQBert](model_doc/qdqbert)** (da NVIDIA) rilasciato con il paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) da Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev e Paulius Micikevicius. -1. **[REALM](model_doc/realm.html)** (da Google Research) rilasciato con il paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) da Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat e Ming-Wei Chang. -1. **[Reformer](model_doc/reformer)** (da Google Research) rilasciato con il paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) da Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. -1. **[RemBERT](model_doc/rembert)** (da Google Research) rilasciato con il paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) da Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. -1. **[RegNet](model_doc/regnet)** (da META Platforms) rilasciato con il paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) da Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. -1. **[ResNet](model_doc/resnet)** (da Microsoft Research) rilasciato con il paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) da Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. -1. **[RoBERTa](model_doc/roberta)** (da Facebook), rilasciato assieme al paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) da Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -1. **[RoFormer](model_doc/roformer)** (da ZhuiyiTechnology), rilasciato assieme al paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) da Jianlin Su e Yu Lu e Shengfeng Pan e Bo Wen e Yunfeng Liu. -1. **[SegFormer](model_doc/segformer)** (da NVIDIA) rilasciato con il paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) da Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. -1. **[SEW](model_doc/sew)** (da ASAPP) rilasciato con il paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) da Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. -1. **[SEW-D](model_doc/sew_d)** (da ASAPP) rilasciato con il paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) da Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. -1. 
**[SpeechToTextTransformer](model_doc/speech_to_text)** (da Facebook), rilasciato assieme al paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) da Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. -1. **[SpeechToTextTransformer2](model_doc/speech_to_text_2)** (da Facebook), rilasciato assieme al paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) da Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. -1. **[Splinter](model_doc/splinter)** (dalla Università di Tel Aviv), rilasciato assieme al paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) da Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. -1. **[SqueezeBert](model_doc/squeezebert)** (da Berkeley) rilasciato con il paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) da Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, e Kurt W. Keutzer. -1. **[Swin Transformer](model_doc/swin)** (da Microsoft) rilasciato con il paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) da Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. -1. **[T5](model_doc/t5)** (da Google AI) rilasciato con il paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) da Colin Raffel e Noam Shazeer e Adam Roberts e Katherine Lee e Sharan Narang e Michael Matena e Yanqi Zhou e Wei Li e Peter J. Liu. -1. **[T5v1.1](model_doc/t5v1.1)** (da Google AI) rilasciato nel repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) da Colin Raffel e Noam Shazeer e Adam Roberts e Katherine Lee e Sharan Narang e Michael Matena e Yanqi Zhou e Wei Li e Peter J. Liu. -1. **[TAPAS](model_doc/tapas)** (da Google AI) rilasciato con il paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) da Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno e Julian Martin Eisenschlos. -1. **[TAPEX](model_doc/tapex)** (da Microsoft Research) rilasciato con il paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) da Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. -1. **[Trajectory Transformer](model_doc/trajectory_transformers)** (dall'Università della California a Berkeley) rilasciato con il paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) da Michael Janner, Qiyang Li, Sergey Levine -1. **[Transformer-XL](model_doc/transfo-xl)** (da Google/CMU) rilasciato con il paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) da Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. -1. **[TrOCR](model_doc/trocr)** (da Microsoft), rilasciato assieme al paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) da Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. -1. 
**[UniSpeech](model_doc/unispeech)** (da Microsoft Research) rilasciato con il paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) da Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. -1. **[UniSpeechSat](model_doc/unispeech-sat)** (da Microsoft Research) rilasciato con il paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) da Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. -1. **[VAN](model_doc/van)** (dalle Università di Tsinghua e Nankai) rilasciato con il paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) da Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. -1. **[ViLT](model_doc/vilt)** (da NAVER AI Lab/Kakao Enterprise/Kakao Brain) rilasciato con il paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) da Wonjae Kim, Bokyung Son, Ildoo Kim. -1. **[Vision Transformer (ViT)](model_doc/vit)** (da Google AI) rilasciato con il paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) da Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. -1. **[ViTMAE](model_doc/vit_mae)** (da Meta AI) rilasciato con il paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) da Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. -1. **[VisualBERT](model_doc/visual_bert)** (da UCLA NLP) rilasciato con il paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) da Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -1. **[WavLM](model_doc/wavlm)** (da Microsoft Research) rilasciato con il paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) da Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. -1. **[Wav2Vec2](model_doc/wav2vec2)** (da Facebook AI) rilasciato con il paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) da Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. -1. **[Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)** (da Facebook AI) rilasciato con il paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) da Qiantong Xu, Alexei Baevski, Michael Auli. -1. **[XGLM](model_doc/xglm)** (da Facebook AI) rilasciato con il paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) da Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. -1. 
**[XLM](model_doc/xlm)** (v Facebook) rilasciato assieme al paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) da Guillaume Lample e Alexis Conneau. -1. **[XLM-ProphetNet](model_doc/xlm-prophetnet)** (da Microsoft Research) rilasciato con il paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) da Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang e Ming Zhou. -1. **[XLM-RoBERTa](model_doc/xlm-roberta)** (da Facebook AI), rilasciato assieme al paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) da Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer e Veselin Stoyanov. -1. **[XLM-RoBERTa-XL](model_doc/xlm-roberta-xl)** (da Facebook AI), rilasciato assieme al paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) da Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. -1. **[XLNet](model_doc/xlnet)** (da Google/CMU) rilasciato con il paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) da Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. -1. **[XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2)** (da Facebook AI) rilasciato con il paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) da Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. -1. **[XLS-R](model_doc/xls_r)** (da Facebook AI) rilasciato con il paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) da Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. -1. **[YOLOS](model_doc/yolos)** (dalla Università della scienza e tecnologia di Huazhong) rilasciato con il paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) da Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu. -1. **[YOSO](model_doc/yoso)** (dall'Università del Wisconsin - Madison) rilasciato con il paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) da Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. - - -### Framework supportati - -La tabella seguente rappresenta il supporto attuale nella libreria per ognuno di questi modelli, si può identificare se questi hanno un Python -tokenizer (chiamato "slow"). Un tokenizer "fast" supportato dalla libreria 🤗 Tokenizers, e se hanno supporto in Jax (via Flax), PyTorch, e/o TensorFlow. 
- - - -| Model | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support | -|:---------------------------:|:--------------:|:--------------:|:---------------:|:------------------:|:------------:| -| ALBERT | ✅ | ✅ | ✅ | ✅ | ✅ | -| BART | ✅ | ✅ | ✅ | ✅ | ✅ | -| BEiT | ❌ | ❌ | ✅ | ❌ | ✅ | -| BERT | ✅ | ✅ | ✅ | ✅ | ✅ | -| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ | -| BigBird | ✅ | ✅ | ✅ | ❌ | ✅ | -| BigBirdPegasus | ❌ | ❌ | ✅ | ❌ | ❌ | -| Blenderbot | ✅ | ✅ | ✅ | ✅ | ✅ | -| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ | -| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| Canine | ✅ | ❌ | ✅ | ❌ | ❌ | -| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | -| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| ConvNext | ❌ | ❌ | ✅ | ✅ | ❌ | -| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | -| CvT | ❌ | ❌ | ✅ | ❌ | ❌ | -| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ | -| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ | -| Data2VecVision | ❌ | ❌ | ✅ | ✅ | ❌ | -| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | -| DeBERTa-v2 | ✅ | ✅ | ✅ | ✅ | ❌ | -| Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | -| DeiT | ❌ | ❌ | ✅ | ❌ | ❌ | -| DETR | ❌ | ❌ | ✅ | ❌ | ❌ | -| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | -| DPR | ✅ | ✅ | ✅ | ✅ | ❌ | -| DPT | ❌ | ❌ | ✅ | ❌ | ❌ | -| ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | -| Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | -| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ | -| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ | -| Flava | ❌ | ❌ | ✅ | ❌ | ❌ | -| FNet | ✅ | ✅ | ✅ | ❌ | ❌ | -| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ | -| GLPN | ❌ | ❌ | ✅ | ❌ | ❌ | -| GPT Neo | ❌ | ❌ | ✅ | ❌ | ✅ | -| GPT NeoX | ❌ | ✅ | ✅ | ❌ | ❌ | -| GPT-J | ❌ | ❌ | ✅ | ✅ | ✅ | -| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ | -| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | -| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ | -| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ | -| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ | -| LayoutLMv3 | ✅ | ✅ | ✅ | ✅ | ❌ | -| LED | ✅ | ✅ | ✅ | ✅ | ❌ | -| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ | -| LUKE | ✅ | ❌ | ✅ | ❌ | ❌ | -| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ | -| Marian | ✅ | ❌ | ✅ | ✅ | ✅ | -| MaskFormer | ❌ | ❌ | ✅ | ❌ | ❌ | -| mBART | ✅ | ✅ | ✅ | ✅ | ✅ | -| MegatronBert | ❌ | ❌ | ✅ | ❌ | ❌ | -| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ | -| mT5 | ✅ | ✅ | ✅ | ✅ | ✅ | -| Nystromformer | ❌ | ❌ | ✅ | ❌ | ❌ | -| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ | -| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | -| OPT | ❌ | ❌ | ✅ | ❌ | ❌ | -| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ | -| Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ | -| PLBart | ✅ | ❌ | ✅ | ❌ | ❌ | -| PoolFormer | ❌ | ❌ | ✅ | ❌ | ❌ | -| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | -| QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ | -| RAG | ✅ | ❌ | ✅ | ✅ | ❌ | -| Realm | ✅ | ✅ | ✅ | ❌ | ❌ | -| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ | -| RegNet | ❌ | ❌ | ✅ | ❌ | ❌ | -| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| ResNet | ❌ | ❌ | ✅ | ❌ | ❌ | -| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | -| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | -| RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ | -| SegFormer | ❌ | ❌ | ✅ | ❌ | ❌ | -| SEW | ❌ | ❌ | ✅ | ❌ | ❌ | -| SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ | -| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ | -| Speech2Text | ✅ | ❌ | ✅ | ✅ | ❌ | -| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ | -| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ | -| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ | -| Swin | ❌ | ❌ | ✅ | ✅ | ❌ | -| T5 | ✅ | ✅ | ✅ | ✅ | ✅ | -| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ | -| Trajectory Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | -| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | -| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ | -| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ | -| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ | -| VAN | ❌ | ❌ | ✅ | ❌ | ❌ | -| ViLT | ❌ | ❌ | ✅ | ❌ | ❌ | -| Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | -| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ | -| VisualBert | ❌ | ❌ | ✅ | ❌ | ❌ | -| ViT | ❌ | ❌ 
| ✅ | ✅ | ✅ | -| ViTMAE | ❌ | ❌ | ✅ | ✅ | ❌ | -| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | -| Wav2Vec2-Conformer | ❌ | ❌ | ✅ | ❌ | ❌ | -| WavLM | ❌ | ❌ | ✅ | ❌ | ❌ | -| XGLM | ✅ | ✅ | ✅ | ❌ | ✅ | -| XLM | ✅ | ❌ | ✅ | ✅ | ❌ | -| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | -| XLM-RoBERTa-XL | ❌ | ❌ | ✅ | ❌ | ❌ | -| XLMProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | -| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ | -| YOLOS | ❌ | ❌ | ✅ | ❌ | ❌ | -| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ | - - diff --git a/docs/source/it/installation.md b/docs/source/it/installation.md new file mode 100644 index 000000000000..4f884f80d936 --- /dev/null +++ b/docs/source/it/installation.md @@ -0,0 +1,239 @@ + + +# Installazione + +Installa 🤗 Transformers per qualsiasi libreria di deep learning con cui stai lavorando, imposta la tua cache, e opzionalmente configura 🤗 Transformers per l'esecuzione offline. + +🤗 Transformers è testato su Python 3.6+, PyTorch 1.1.0+, TensorFlow 2.0+, e Flax. Segui le istruzioni di installazione seguenti per la libreria di deep learning che stai utilizzando: + +* [PyTorch](https://pytorch.org/get-started/locally/) istruzioni di installazione. +* [TensorFlow 2.0](https://www.tensorflow.org/install/pip) istruzioni di installazione. +* [Flax](https://flax.readthedocs.io/en/latest/) istruzioni di installazione. + +## Installazione con pip + +Puoi installare 🤗 Transformers in un [ambiente virtuale](https://docs.python.org/3/library/venv.html). Se non sei familiare con gli ambienti virtuali in Python, dai un'occhiata a questa [guida](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). Un ambiente virtuale rende più semplice la gestione di progetti differenti, evitando problemi di compatibilità tra dipendenze. + +Inizia creando un ambiente virtuale nella directory del tuo progetto: + +```bash +python -m venv .env +``` + +Attiva l'ambiente virtuale: + +```bash +source .env/bin/activate +``` + +Ora puoi procedere con l'installazione di 🤗 Transformers eseguendo il comando seguente: + +```bash +pip install transformers +``` + +Per il solo supporto della CPU, puoi installare facilmente 🤗 Transformers e una libreria di deep learning in solo una riga. Ad esempio, installiamo 🤗 Transformers e PyTorch con: + +```bash +pip install transformers[torch] +``` + +🤗 Transformers e TensorFlow 2.0: + +```bash +pip install transformers[tf-cpu] +``` + +🤗 Transformers e Flax: + +```bash +pip install transformers[flax] +``` + +Infine, verifica se 🤗 Transformers è stato installato in modo appropriato eseguendo il seguente comando. Questo scaricherà un modello pre-allenato: + +```bash +python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))" +``` + +Dopodiché stampa l'etichetta e il punteggio: + +```bash +[{'label': 'POSITIVE', 'score': 0.9998704791069031}] +``` + +## Installazione dalla fonte + +Installa 🤗 Transformers dalla fonte con il seguente comando: + +```bash +pip install git+https://github.com/huggingface/transformers +``` + +Questo comando installa la versione `main` più attuale invece dell'ultima versione stabile. Questo è utile per stare al passo con gli ultimi sviluppi. Ad esempio, se un bug è stato sistemato da quando è uscita l'ultima versione ufficiale ma non è stata ancora rilasciata una nuova versione. Tuttavia, questo significa che questa versione `main` può non essere sempre stabile. Ci sforziamo per mantenere la versione `main` operativa, e la maggior parte dei problemi viene risolta in poche ore o in un giorno. 
Se riscontri un problema, per favore apri una [Issue](https://github.com/huggingface/transformers/issues) così possiamo sistemarlo ancora più velocemente! + +Controlla se 🤗 Transformers è stata installata in modo appropriato con il seguente comando: + +```bash +python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I love you'))" +``` + +## Installazione modificabile + +Hai bisogno di un'installazione modificabile se vuoi: + +* Usare la versione `main` del codice dalla fonte. +* Contribuire a 🤗 Transformers e hai bisogno di testare i cambiamenti nel codice. + +Clona il repository e installa 🤗 Transformers con i seguenti comandi: + +```bash +git clone https://github.com/huggingface/transformers.git +cd transformers +pip install -e . +``` + +Questi comandi collegheranno la cartella in cui è stato clonato il repository e i path delle librerie Python. Python guarderà ora all'interno della cartella clonata, oltre ai normali path delle librerie. Per esempio, se i tuoi pacchetti Python sono installati tipicamente in `~/anaconda3/envs/main/lib/python3.7/site-packages/`, Python cercherà anche nella cartella clonata: `~/transformers/`. + + + +Devi tenere la cartella `transformers` se vuoi continuare ad utilizzare la libreria. + + + +Ora puoi facilmente aggiornare il tuo clone all'ultima versione di 🤗 Transformers con il seguente comando: + +```bash +cd ~/transformers/ +git pull +``` + +Il tuo ambiente Python troverà la versione `main` di 🤗 Transformers alla prossima esecuzione. + +## Installazione con conda + +Installazione dal canale conda `huggingface`: + +```bash +conda install -c huggingface transformers +``` + +## Impostazione della cache + +I modelli pre-allenati sono scaricati e memorizzati localmente nella cache in: `~/.cache/huggingface/transformers/`. Questa è la directory di default data dalla variabile d'ambiente della shell `TRANSFORMERS_CACHE`. Su Windows, la directory di default è data da `C:\Users\username\.cache\huggingface\transformers`. Puoi cambiare le variabili d'ambiente della shell indicate in seguito, in ordine di priorità, per specificare una directory differente per la cache: + +1. Variabile d'ambiente della shell (default): `TRANSFORMERS_CACHE`. +2. Variabile d'ambiente della shell: `HF_HOME` + `transformers/`. +3. Variabile d'ambiente della shell: `XDG_CACHE_HOME` + `/huggingface/transformers`. + + + +🤗 Transformers utilizzerà le variabili d'ambiente della shell `PYTORCH_TRANSFORMERS_CACHE` o `PYTORCH_PRETRAINED_BERT_CACHE` se si proviene da un'iterazione precedente di questa libreria e sono state impostate queste variabili d'ambiente, a meno che non si specifichi la variabile d'ambiente della shell `TRANSFORMERS_CACHE`. + + + +## Modalità Offline + +🤗 Transformers può essere eseguita in un ambiente firewalled o offline utilizzando solo file locali. Imposta la variabile d'ambiente `TRANSFORMERS_OFFLINE=1` per abilitare questo comportamento. + + + +Aggiungi [🤗 Datasets](https://huggingface.co/docs/datasets/) al tuo flusso di lavoro offline di training impostando la variabile d'ambiente `HF_DATASETS_OFFLINE=1`. + + + +Ad esempio, in genere si esegue un programma su una rete normale, protetta da firewall per le istanze esterne, con il seguente comando: + +```bash +python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... 
+``` + +Esegui lo stesso programma in un'istanza offline con: + +```bash +HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \ +python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... +``` + +Lo script viene ora eseguito senza bloccarsi o attendere il timeout, perché sa di dover cercare solo file locali. + +### Ottenere modelli e tokenizer per l'uso offline + +Un'altra opzione per utilizzare offline 🤗 Transformers è scaricare i file in anticipo, e poi puntare al loro path locale quando hai la necessità di utilizzarli offline. Ci sono tre modi per fare questo: + +* Scarica un file tramite l'interfaccia utente sul [Model Hub](https://huggingface.co/models) premendo sull'icona ↓. + + ![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/download-icon.png) + +* Utilizza il flusso [`PreTrainedModel.from_pretrained`] e [`PreTrainedModel.save_pretrained`]: + + 1. Scarica i tuoi file in anticipo con [`PreTrainedModel.from_pretrained`]: + + ```py + >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM + + >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/T0_3B") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0_3B") + ``` + + 2. Salva i tuoi file in una directory specificata con [`PreTrainedModel.save_pretrained`]: + + ```py + >>> tokenizer.save_pretrained("./il/tuo/path/bigscience_t0") + >>> model.save_pretrained("./il/tuo/path/bigscience_t0") + ``` + + 3. Ora quando sei offline, carica i tuoi file con [`PreTrainedModel.from_pretrained`] dalla directory specificata: + + ```py + >>> tokenizer = AutoTokenizer.from_pretrained("./il/tuo/path/bigscience_t0") + >>> model = AutoModel.from_pretrained("./il/tuo/path/bigscience_t0") + ``` + +* Scarica in maniera programmatica i file con la libreria [huggingface_hub](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub): + + 1. Installa la libreria `huggingface_hub` nel tuo ambiente virtuale: + + ```bash + python -m pip install huggingface_hub + ``` + + 2. Utilizza la funzione [`hf_hub_download`](https://huggingface.co/docs/hub/adding-a-library#download-files-from-the-hub) per scaricare un file in un path specifico. Per esempio, il seguente comando scarica il file `config.json` dal modello [T0](https://huggingface.co/bigscience/T0_3B) nel path che desideri: + + ```py + >>> from huggingface_hub import hf_hub_download + + >>> hf_hub_download(repo_id="bigscience/T0_3B", filename="config.json", cache_dir="./il/tuo/path/bigscience_t0") + ``` + +Una volta che il tuo file è scaricato e salvato in cache localmente, specifica il suo path locale per caricarlo e utilizzarlo: + +```py +>>> from transformers import AutoConfig + +>>> config = AutoConfig.from_pretrained("./il/tuo/path/bigscience_t0/config.json") +``` + + + +Fai riferimento alla sezione [How to download files from the Hub](https://huggingface.co/docs/hub/how-to-downstream) per avere maggiori dettagli su come scaricare modelli presenti sull Hub. + + \ No newline at end of file diff --git a/docs/source/it/installation.mdx b/docs/source/it/installation.mdx deleted file mode 100644 index 1ff47c110cff..000000000000 --- a/docs/source/it/installation.mdx +++ /dev/null @@ -1,235 +0,0 @@ - - -# Installazione - -Installa 🤗 Transformers per qualsiasi libreria di deep learning con cui stai lavorando, imposta la tua cache, e opzionalmente configura 🤗 Transformers per l'esecuzione offline. 
- -🤗 Transformers è testato su Python 3.6+, PyTorch 1.1.0+, TensorFlow 2.0+, e Flax. Segui le istruzioni di installazione seguenti per la libreria di deep learning che stai utilizzando: - -* [PyTorch](https://pytorch.org/get-started/locally/) istruzioni di installazione. -* [TensorFlow 2.0](https://www.tensorflow.org/install/pip) istruzioni di installazione. -* [Flax](https://flax.readthedocs.io/en/latest/) istruzioni di installazione. - -## Installazione con pip - -Puoi installare 🤗 Transformers in un [ambiente virtuale](https://docs.python.org/3/library/venv.html). Se non sei familiare con gli ambienti virtuali in Python, dai un'occhiata a questa [guida](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). Un ambiente virtuale rende più semplice la gestione di progetti differenti, evitando problemi di compatibilità tra dipendenze. - -Inizia creando un ambiente virtuale nella directory del tuo progetto: - -```bash -python -m venv .env -``` - -Attiva l'ambiente virtuale: - -```bash -source .env/bin/activate -``` - -Ora puoi procedere con l'installazione di 🤗 Transformers eseguendo il comando seguente: - -```bash -pip install transformers -``` - -Per il solo supporto della CPU, puoi installare facilmente 🤗 Transformers e una libreria di deep learning in solo una riga. Ad esempio, installiamo 🤗 Transformers e PyTorch con: - -```bash -pip install transformers[torch] -``` - -🤗 Transformers e TensorFlow 2.0: - -```bash -pip install transformers[tf-cpu] -``` - -🤗 Transformers e Flax: - -```bash -pip install transformers[flax] -``` - -Infine, verifica se 🤗 Transformers è stato installato in modo appropriato eseguendo il seguente comando. Questo scaricherà un modello pre-allenato: - -```bash -python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))" -``` - -Dopodiché stampa l'etichetta e il punteggio: - -```bash -[{'label': 'POSITIVE', 'score': 0.9998704791069031}] -``` - -## Installazione dalla fonte - -Installa 🤗 Transformers dalla fonte con il seguente comando: - -```bash -pip install git+https://github.com/huggingface/transformers -``` - -Questo comando installa la versione `main` più attuale invece dell'ultima versione stabile. Questo è utile per stare al passo con gli ultimi sviluppi. Ad esempio, se un bug è stato sistemato da quando è uscita l'ultima versione ufficiale ma non è stata ancora rilasciata una nuova versione. Tuttavia, questo significa che questa versione `main` può non essere sempre stabile. Ci sforziamo per mantenere la versione `main` operativa, e la maggior parte dei problemi viene risolta in poche ore o in un giorno. Se riscontri un problema, per favore apri una [Issue](https://github.com/huggingface/transformers/issues) così possiamo sistemarlo ancora più velocemente! - -Controlla se 🤗 Transformers è stata installata in modo appropriato con il seguente comando: - -```bash -python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I love you'))" -``` - -## Installazione modificabile - -Hai bisogno di un'installazione modificabile se vuoi: - -* Usare la versione `main` del codice dalla fonte. -* Contribuire a 🤗 Transformers e hai bisogno di testare i cambiamenti nel codice. - -Clona il repository e installa 🤗 Transformers con i seguenti comandi: - -```bash -git clone https://github.com/huggingface/transformers.git -cd transformers -pip install -e . -``` - -Questi comandi collegheranno la cartella in cui è stato clonato il repository e i path delle librerie Python. 
Python guarderà ora all'interno della cartella clonata, oltre ai normali path delle librerie. Per esempio, se i tuoi pacchetti Python sono installati tipicamente in `~/anaconda3/envs/main/lib/python3.7/site-packages/`, Python cercherà anche nella cartella clonata: `~/transformers/`. - - - -Devi tenere la cartella `transformers` se vuoi continuare ad utilizzare la libreria. - - - -Ora puoi facilmente aggiornare il tuo clone all'ultima versione di 🤗 Transformers con il seguente comando: - -```bash -cd ~/transformers/ -git pull -``` - -Il tuo ambiente Python troverà la versione `main` di 🤗 Transformers alla prossima esecuzione. - -## Installazione con conda - -Installazione dal canale conda `huggingface`: - -```bash -conda install -c huggingface transformers -``` - -## Impostazione della cache - -I modelli pre-allenati sono scaricati e memorizzati localmente nella cache in: `~/.cache/huggingface/transformers/`. Questa è la directory di default data dalla variabile d'ambiente della shell `TRANSFORMERS_CACHE`. Su Windows, la directory di default è data da `C:\Users\username\.cache\huggingface\transformers`. Puoi cambiare le variabili d'ambiente della shell indicate in seguito, in ordine di priorità, per specificare una directory differente per la cache: - -1. Variabile d'ambiente della shell (default): `TRANSFORMERS_CACHE`. -2. Variabile d'ambiente della shell: `HF_HOME` + `transformers/`. -3. Variabile d'ambiente della shell: `XDG_CACHE_HOME` + `/huggingface/transformers`. - - - -🤗 Transformers utilizzerà le variabili d'ambiente della shell `PYTORCH_TRANSFORMERS_CACHE` o `PYTORCH_PRETRAINED_BERT_CACHE` se si proviene da un'iterazione precedente di questa libreria e sono state impostate queste variabili d'ambiente, a meno che non si specifichi la variabile d'ambiente della shell `TRANSFORMERS_CACHE`. - - - -## Modalità Offline - -🤗 Transformers può essere eseguita in un ambiente firewalled o offline utilizzando solo file locali. Imposta la variabile d'ambiente `TRANSFORMERS_OFFLINE=1` per abilitare questo comportamento. - - - -Aggiungi [🤗 Datasets](https://huggingface.co/docs/datasets/) al tuo flusso di lavoro offline di training impostando la variabile d'ambiente `HF_DATASETS_OFFLINE=1`. - - - -Ad esempio, in genere si esegue un programma su una rete normale, protetta da firewall per le istanze esterne, con il seguente comando: - -```bash -python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... -``` - -Esegui lo stesso programma in un'istanza offline con: - -```bash -HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \ -python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... -``` - -Lo script viene ora eseguito senza bloccarsi o attendere il timeout, perché sa di dover cercare solo file locali. - -### Ottenere modelli e tokenizer per l'uso offline - -Un'altra opzione per utilizzare offline 🤗 Transformers è scaricare i file in anticipo, e poi puntare al loro path locale quando hai la necessità di utilizzarli offline. Ci sono tre modi per fare questo: - -* Scarica un file tramite l'interfaccia utente sul [Model Hub](https://huggingface.co/models) premendo sull'icona ↓. - - ![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/download-icon.png) - -* Utilizza il flusso [`PreTrainedModel.from_pretrained`] e [`PreTrainedModel.save_pretrained`]: - - 1. 
Scarica i tuoi file in anticipo con [`PreTrainedModel.from_pretrained`]: - - ```py - >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - - >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/T0_3B") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0_3B") - ``` - - 2. Salva i tuoi file in una directory specificata con [`PreTrainedModel.save_pretrained`]: - - ```py - >>> tokenizer.save_pretrained("./il/tuo/path/bigscience_t0") - >>> model.save_pretrained("./il/tuo/path/bigscience_t0") - ``` - - 3. Ora quando sei offline, carica i tuoi file con [`PreTrainedModel.from_pretrained`] dalla directory specificata: - - ```py - >>> tokenizer = AutoTokenizer.from_pretrained("./il/tuo/path/bigscience_t0") - >>> model = AutoModel.from_pretrained("./il/tuo/path/bigscience_t0") - ``` - -* Scarica in maniera programmatica i file con la libreria [huggingface_hub](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub): - - 1. Installa la libreria `huggingface_hub` nel tuo ambiente virtuale: - - ```bash - python -m pip install huggingface_hub - ``` - - 2. Utilizza la funzione [`hf_hub_download`](https://huggingface.co/docs/hub/adding-a-library#download-files-from-the-hub) per scaricare un file in un path specifico. Per esempio, il seguente comando scarica il file `config.json` dal modello [T0](https://huggingface.co/bigscience/T0_3B) nel path che desideri: - - ```py - >>> from huggingface_hub import hf_hub_download - - >>> hf_hub_download(repo_id="bigscience/T0_3B", filename="config.json", cache_dir="./il/tuo/path/bigscience_t0") - ``` - -Una volta che il tuo file è scaricato e salvato in cache localmente, specifica il suo path locale per caricarlo e utilizzarlo: - -```py ->>> from transformers import AutoConfig - ->>> config = AutoConfig.from_pretrained("./il/tuo/path/bigscience_t0/config.json") -``` - - - -Fai riferimento alla sezione [How to download files from the Hub](https://huggingface.co/docs/hub/how-to-downstream) per avere maggiori dettagli su come scaricare modelli presenti sull Hub. - - \ No newline at end of file diff --git a/docs/source/it/migration.md b/docs/source/it/migration.md new file mode 100644 index 000000000000..3b3b71da4d49 --- /dev/null +++ b/docs/source/it/migration.md @@ -0,0 +1,320 @@ + + +# Migrazione da pacchetti precedenti + +## Migrazione da transformers `v3.x` a `v4.x` + +Un paio di modifiche sono state introdotte nel passaggio dalla versione 3 alla versione 4. Di seguito è riportato un riepilogo delle +modifiche previste: + +#### 1. AutoTokenizer e pipeline ora utilizzano tokenizer veloci (rust) per impostazione predefinita. + +I tokenizer python e rust hanno all'incirca le stesse API, ma i tokenizer rust hanno un set di funzionalità più completo. + +Ciò introduce due modifiche sostanziali: +- La gestione dei token in overflow tra i tokenizer Python e Rust è diversa. +- I tokenizers di rust non accettano numeri interi nei metodi di codifica. + +##### Come ottenere lo stesso comportamento di v3.x in v4.x + +- Le pipeline ora contengono funzionalità aggiuntive pronte all'uso. Vedi la [pipeline di classificazione dei token con il flag `grouped_entities`](main_classes/pipelines#transformers.TokenClassificationPipeline). +- Gli auto-tokenizer ora restituiscono tokenizer rust. 
Per ottenere invece i tokenizer python, l'utente deve usare il flag `use_fast` impostandolo a `False`:
+
+Nella versione `v3.x`:
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+```
+per ottenere lo stesso nella versione `v4.x`:
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
+```
+
+#### 2. SentencePiece è stato rimosso dalle dipendenze richieste
+
+Il requisito sulla dipendenza SentencePiece è stato rimosso da `setup.py`. È stato fatto per avere un canale su anaconda cloud senza basarsi su `conda-forge`. Ciò significa che i tokenizer che dipendono dalla libreria SentencePiece non saranno disponibili con un'installazione standard di `transformers`.
+
+Ciò include le versioni **lente** di:
+- `XLNetTokenizer`
+- `AlbertTokenizer`
+- `CamembertTokenizer`
+- `MBartTokenizer`
+- `PegasusTokenizer`
+- `T5Tokenizer`
+- `ReformerTokenizer`
+- `XLMRobertaTokenizer`
+
+##### Come ottenere lo stesso comportamento della v3.x nella v4.x
+
+Per ottenere lo stesso comportamento della versione `v3.x`, devi installare anche `sentencepiece`:
+
+Nella versione `v3.x`:
+```bash
+pip install transformers
+```
+per ottenere lo stesso nella versione `v4.x`:
+```bash
+pip install transformers[sentencepiece]
+```
+o
+```bash
+pip install transformers sentencepiece
+```
+
+#### 3. L'architettura delle repo è stata aggiornata in modo che ogni modello abbia la propria cartella
+
+Con l'aggiunta di nuovi modelli, il numero di file nella cartella `src/transformers` continua a crescere e diventa più difficile navigare e capire. Abbiamo fatto la scelta di inserire ogni modello e i file che lo accompagnano nelle proprie sottocartelle.
+
+Si tratta di una modifica sostanziale in quanto l'importazione di layer intermedi utilizzando direttamente il modulo di un modello deve essere eseguita tramite un percorso diverso.
+
+##### Come ottenere lo stesso comportamento della v3.x nella v4.x
+
+Per ottenere lo stesso comportamento della versione `v3.x`, devi aggiornare il percorso utilizzato per accedere ai layer.
+
+Nella versione `v3.x`:
+```py
+from transformers.modeling_bert import BertLayer
+```
+per ottenere lo stesso nella versione `v4.x`:
+```py
+from transformers.models.bert.modeling_bert import BertLayer
+```
+
+#### 4. Impostare l'argomento `return_dict` su `True` per impostazione predefinita
+
+L'[argomento `return_dict`](main_classes/output) abilita la restituzione di oggetti python dict-like contenenti gli output del modello, invece delle tuple standard. Questo oggetto è self-documented poiché le chiavi possono essere utilizzate per recuperare valori, comportandosi anche come una tupla e gli utenti possono recuperare oggetti per indexing o slicing.
+
+Questa è una modifica sostanziale poiché la tupla non può essere decompressa: `value0, value1 = outputs` non funzionerà.
+
+##### Come ottenere lo stesso comportamento della v3.x nella v4.x
+
+Per ottenere lo stesso comportamento della versione `v3.x`, specifica l'argomento `return_dict` come `False`, sia nella configurazione del modello che durante il forward pass.
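+
+A titolo di confronto, ecco uno schizzo minimo (ipotetico, non presente nella guida originale) di come si accede a un output dict-like quando `return_dict` resta `True`, il comportamento predefinito in `v4.x`:
+
+```py
+from transformers import BertModel, BertTokenizer
+
+tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+model = BertModel.from_pretrained("bert-base-cased")
+
+inputs = tokenizer("Ciao!", return_tensors="pt")
+outputs = model(**inputs)  # return_dict=True è il default in v4.x
+
+hidden = outputs.last_hidden_state  # accesso come attributo
+hidden = outputs["last_hidden_state"]  # accesso come chiave
+hidden = outputs[0]  # indexing, come una tupla
+```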
+
+Nella versione `v3.x`:
+```py
+model = BertModel.from_pretrained("bert-base-cased")
+outputs = model(**inputs)
+```
+per ottenere lo stesso nella versione `v4.x`:
+```py
+model = BertModel.from_pretrained("bert-base-cased")
+outputs = model(**inputs, return_dict=False)
+```
+o
+```py
+model = BertModel.from_pretrained("bert-base-cased", return_dict=False)
+outputs = model(**inputs)
+```
+
+#### 5. Rimozione di alcuni attributi deprecati
+
+Gli attributi sono stati rimossi se deprecati da almeno un mese. L'elenco completo degli attributi obsoleti è disponibile in [#8604](https://github.com/huggingface/transformers/pull/8604).
+
+Ecco un elenco di questi attributi/metodi/argomenti e quali dovrebbero essere le loro sostituzioni:
+
+In diversi modelli, le etichette diventano coerenti con gli altri modelli:
+- `masked_lm_labels` diventa `labels` in `AlbertForMaskedLM` e `AlbertForPreTraining`.
+- `masked_lm_labels` diventa `labels` in `BertForMaskedLM` e `BertForPreTraining`.
+- `masked_lm_labels` diventa `labels` in `DistilBertForMaskedLM`.
+- `masked_lm_labels` diventa `labels` in `ElectraForMaskedLM`.
+- `masked_lm_labels` diventa `labels` in `LongformerForMaskedLM`.
+- `masked_lm_labels` diventa `labels` in `MobileBertForMaskedLM`.
+- `masked_lm_labels` diventa `labels` in `RobertaForMaskedLM`.
+- `lm_labels` diventa `labels` in `BartForConditionalGeneration`.
+- `lm_labels` diventa `labels` in `GPT2DoubleHeadsModel`.
+- `lm_labels` diventa `labels` in `OpenAIGPTDoubleHeadsModel`.
+- `lm_labels` diventa `labels` in `T5ForConditionalGeneration`.
+
+In diversi modelli, il meccanismo di memorizzazione nella cache diventa coerente con gli altri:
+- `decoder_cached_states` diventa `past_key_values` in tutti i modelli BART-like, FSMT e T5.
+- `decoder_past_key_values` diventa `past_key_values` in tutti i modelli BART-like, FSMT e T5.
+- `past` diventa `past_key_values` in tutti i modelli CTRL.
+- `past` diventa `past_key_values` in tutti i modelli GPT-2.
+
+Per quanto riguarda le classi tokenizer:
+- L'attributo tokenizer `max_len` diventa `model_max_length`.
+- L'attributo tokenizer `return_lengths` diventa `return_length`.
+- L'argomento di codifica del tokenizer `is_pretokenized` diventa `is_split_into_words`.
+
+Per quanto riguarda la classe `Trainer`:
+- L'argomento `tb_writer` di `Trainer` è stato rimosso in favore della funzione richiamabile `TensorBoardCallback(tb_writer=...)`.
+- L'argomento `prediction_loss_only` di `Trainer` è stato rimosso in favore dell'argomento di classe `args.prediction_loss_only`.
+- L'attributo `data_collator` di `Trainer` sarà richiamabile.
+- Il metodo `_log` di `Trainer` è deprecato a favore di `log`.
+- Il metodo `_training_step` di `Trainer` è deprecato a favore di `training_step`.
+- Il metodo `_prediction_loop` di `Trainer` è deprecato a favore di `prediction_loop`.
+- Il metodo `is_local_master` di `Trainer` è deprecato a favore di `is_local_process_zero`.
+- Il metodo `is_world_master` di `Trainer` è deprecato a favore di `is_world_process_zero`.
+
+Per quanto riguarda la classe `TFTrainer`:
+- L'argomento `prediction_loss_only` di `TFTrainer` è stato rimosso a favore dell'argomento di classe `args.prediction_loss_only`.
+- Il metodo `_log` di `Trainer` è deprecato a favore di `log`.
+- Il metodo `_prediction_loop` di `TFTrainer` è deprecato a favore di `prediction_loop`.
+- Il metodo `_setup_wandb` di `TFTrainer` è deprecato a favore di `setup_wandb`.
+- Il metodo `_run_model` di `TFTrainer` è deprecato a favore di `run_model`.
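+
+Ad esempio, uno schizzo minimo (ipotetico, non presente nella guida originale) di come sostituire i vecchi metodi di `Trainer` elencati sopra con i nuovi nomi:
+
+```py
+from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
+
+# Trainer minimale, solo per illustrare i nuovi nomi dei metodi
+model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
+trainer = Trainer(model=model, args=TrainingArguments(output_dir="./out"))
+
+# v3.x (deprecato/rimosso):
+#   if trainer.is_world_master():
+#       trainer._log({"eval_loss": 0.5})
+
+# v4.x:
+if trainer.is_world_process_zero():
+    trainer.log({"eval_loss": 0.5})
+```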
+
+Per quanto riguarda la classe `TrainingArguments`:
+- L'argomento `evaluate_during_training` di `TrainingArguments` è deprecato a favore di `evaluation_strategy`.
+
+Per quanto riguarda il modello Transfo-XL:
+- L'attributo di configurazione `tie_weight` di Transfo-XL diventa `tie_words_embeddings`.
+- Il metodo di modellazione `reset_length` di Transfo-XL diventa `reset_memory_length`.
+
+Per quanto riguarda le pipeline:
+- L'argomento `topk` di `FillMaskPipeline` diventa `top_k`.
+
+
+
+## Passaggio da pytorch-transformers a 🤗 Transformers
+
+Ecco un breve riepilogo di ciò a cui prestare attenzione durante il passaggio da `pytorch-transformers` a 🤗 Transformers.
+
+### L'ordine posizionale di alcune parole chiave di input dei modelli (`attention_mask`, `token_type_ids`...) è cambiato
+
+Per usare Torchscript (vedi #1010, #1204 e #1195) l'ordine specifico delle **parole chiave di input** di alcuni modelli (`attention_mask`, `token_type_ids`...) è stato modificato.
+
+Se chiamavi i modelli usando parole chiave per gli argomenti, ad esempio `model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)`, questo non dovrebbe causare alcun cambiamento.
+
+Se chiamavi i modelli con input posizionali per gli argomenti, ad esempio `model(input_ids, attention_mask, token_type_ids)`, potrebbe essere necessario ricontrollare l'ordine esatto degli argomenti di input.
+
+## Migrazione da pytorch-pretrained-bert
+
+Ecco un breve riepilogo di ciò a cui prestare attenzione durante la migrazione da `pytorch-pretrained-bert` a 🤗 Transformers.
+
+### I modelli restituiscono sempre `tuple`
+
+La principale modifica di rilievo durante la migrazione da `pytorch-pretrained-bert` a 🤗 Transformers è che il metodo di previsione (forward) dei modelli restituisce sempre una `tupla` con vari elementi, a seconda del modello e dei parametri di configurazione.
+
+Il contenuto esatto delle tuple per ciascun modello è mostrato in dettaglio nelle docstring dei modelli e nella [documentazione](https://huggingface.co/transformers/).
+
+In quasi tutti i casi, andrà bene prendere il primo elemento dell'output come quello che avresti precedentemente utilizzato in `pytorch-pretrained-bert`.
+
+Ecco un esempio di conversione da `pytorch-pretrained-bert`
+ a 🤗 Transformers per un modello di classificazione `BertForSequenceClassification`:
+
+```python
+# Carichiamo il nostro modello
+model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
+
+# Se usavi questa riga in pytorch-pretrained-bert:
+loss = model(input_ids, labels=labels)
+
+# Ora usa questa riga in 🤗 Transformers per estrarre la perdita dalla tupla di output:
+outputs = model(input_ids, labels=labels)
+loss = outputs[0]
+
+# In 🤗 Transformers puoi anche avere accesso ai logit:
+loss, logits = outputs[:2]
+
+# Ed anche agli attention weight se configuri il modello per restituirli (e anche altri output, vedi le docstring e la documentazione)
+model = BertForSequenceClassification.from_pretrained("bert-base-uncased", output_attentions=True)
+outputs = model(input_ids, labels=labels)
+loss, logits, attentions = outputs
+```
+
+### Serializzazione
+
+Modifica sostanziale nel metodo `from_pretrained()`:
+
+1. I modelli sono ora impostati in modalità di valutazione in maniera predefinita quando usi il metodo `from_pretrained()`. Per addestrarli non dimenticare di riportarli in modalità di addestramento (`model.train()`) per attivare i moduli di dropout.
+
+2.
Gli argomenti aggiuntivi `*inputs` e `**kwargs` forniti al metodo `from_pretrained()` venivano passati direttamente al metodo `__init__()` della classe sottostante del modello. Ora sono usati per aggiornare prima l'attributo di configurazione del modello, il che può non funzionare con le classi di modello derivate costruite basandosi sui precedenti esempi di `BertForSequenceClassification`. Più precisamente, gli argomenti posizionali `*inputs` forniti a `from_pretrained()` vengono inoltrati direttamente al metodo `__init__()` del modello, mentre gli argomenti keyword `**kwargs` che (i) corrispondono agli attributi della classe di configurazione vengono utilizzati per aggiornare tali attributi, e quelli che (ii) non corrispondono ad alcun attributo della classe di configurazione vengono inoltrati al metodo `__init__()`.
+
+Inoltre, sebbene non si tratti di una modifica sostanziale, i metodi di serializzazione sono stati standardizzati e probabilmente dovresti passare al nuovo metodo `save_pretrained(save_directory)` se prima usavi qualsiasi altro metodo di serializzazione.
+
+Ecco un esempio:
+
+```python
+### Carichiamo un modello e un tokenizer
+model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
+tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+
+### Facciamo fare alcune cose al nostro modello e tokenizer
+# Es: aggiungiamo nuovi token al vocabolario e agli embedding del nostro modello
+tokenizer.add_tokens(["[SPECIAL_TOKEN_1]", "[SPECIAL_TOKEN_2]"])
+model.resize_token_embeddings(len(tokenizer))
+# Alleniamo il nostro modello
+train(model)
+
+### Ora salviamo il nostro modello e il tokenizer in una cartella
+model.save_pretrained("./my_saved_model_directory/")
+tokenizer.save_pretrained("./my_saved_model_directory/")
+
+### Ricarichiamo il modello e il tokenizer
+model = BertForSequenceClassification.from_pretrained("./my_saved_model_directory/")
+tokenizer = BertTokenizer.from_pretrained("./my_saved_model_directory/")
+```
+
+### Ottimizzatori: BertAdam e OpenAIAdam ora sono AdamW, lo scheduling è quello standard PyTorch
+
+I due ottimizzatori precedenti inclusi, `BertAdam` e `OpenAIAdam`, sono stati sostituiti da un singolo `AdamW` che presenta alcune differenze:
+
+- implementa solo la correzione del weight decay,
+- lo scheduling ora è esterno (vedi sotto),
+- anche il gradient clipping ora è esterno (vedi sotto).
+
+Il nuovo ottimizzatore `AdamW` corrisponde alle API di `Adam` di PyTorch e ti consente di utilizzare metodi PyTorch o apex per lo scheduling e il clipping.
+
+Gli scheduler sono ora i [PyTorch learning rate schedulers](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) standard e non fanno più parte dell'ottimizzatore.
+
+Ecco un esempio di linear warmup e decay con `BertAdam` e con `AdamW`:
+
+```python
+# Parametri:
+lr = 1e-3
+max_grad_norm = 1.0
+num_training_steps = 1000
+num_warmup_steps = 100
+warmup_proportion = float(num_warmup_steps) / float(num_training_steps)  # 0.1
+
+### In precedenza l'ottimizzatore BertAdam veniva istanziato in questo modo:
+optimizer = BertAdam(
+    model.parameters(),
+    lr=lr,
+    schedule="warmup_linear",
+    warmup=warmup_proportion,
+    t_total=num_training_steps,
+)
+### e usato in questo modo:
+for batch in train_data:
+    loss = model(batch)
+    loss.backward()
+    optimizer.step()
+
+### In 🤗 Transformers, ottimizzatore e schedule sono divisi e usati in questo modo:
+optimizer = AdamW(
+    model.parameters(), lr=lr, correct_bias=False
+)  # Per riprodurre il comportamento specifico di BertAdam impostare correct_bias=False
+scheduler = get_linear_schedule_with_warmup(
+    optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
+)  # PyTorch scheduler
+### e va usato così:
+for batch in train_data:
+    loss = model(batch)
+    loss.backward()
+    torch.nn.utils.clip_grad_norm_(
+        model.parameters(), max_grad_norm
+    )  # Gradient clipping non è più in AdamW (quindi puoi usare amp senza problemi)
+    optimizer.step()
+    scheduler.step()
+```
diff --git a/docs/source/it/model_sharing.md b/docs/source/it/model_sharing.md
new file mode 100644
index 000000000000..351cf57bf96b
--- /dev/null
+++ b/docs/source/it/model_sharing.md
@@ -0,0 +1,238 @@
+
+
+# Condividi un modello
+
+Gli ultimi due tutorial ti hanno mostrato come puoi fare fine-tuning di un modello con PyTorch, Keras e 🤗 Accelerate per configurazioni distribuite. Il prossimo passo è quello di condividere il tuo modello con la community! Noi di Hugging Face crediamo nella condivisione della conoscenza e delle risorse in modo da democratizzare l'intelligenza artificiale per chiunque. Ti incoraggiamo a considerare di condividere il tuo modello con la community per aiutare altre persone a risparmiare tempo e risorse.
+
+In questo tutorial, imparerai due metodi per la condivisione di un modello trained o fine-tuned nel [Model Hub](https://huggingface.co/models):
+
+- Condividi in modo programmatico i tuoi file nell'Hub.
+- Trascina i tuoi file nell'Hub mediante interfaccia grafica.
+
+
+
+
+
+
+Per condividere un modello con la community, hai bisogno di un account su [huggingface.co](https://huggingface.co/join). Puoi anche unirti ad un'organizzazione esistente o crearne una nuova.
+
+
+
+## Caratteristiche dei repository
+
+Ogni repository nel Model Hub si comporta come un tipico repository di GitHub. I nostri repository offrono il versionamento, la cronologia dei commit, e la possibilità di visualizzare le differenze.
+
+Il versionamento all'interno del Model Hub è basato su git e [git-lfs](https://git-lfs.github.com/). In altre parole, puoi trattare un modello come un unico repository, consentendo un maggiore controllo degli accessi e maggiore scalabilità. Il controllo delle versioni consente *revisions*, un metodo per appuntare una versione specifica di un modello con un hash di commit, un tag o un branch.
+
+Come risultato, puoi caricare una specifica versione di un modello con il parametro `revision`:
+
+```py
+>>> model = AutoModel.from_pretrained(
+... "julien-c/EsperBERTo-small", revision="v2.0.1" # nome di un tag, di un branch, o commit hash
+...
) +``` + +Anche i file possono essere modificati facilmente in un repository ed è possibile visualizzare la cronologia dei commit e le differenze: + +![vis_diff](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vis_diff.png) + +## Configurazione + +Prima di condividere un modello nell'Hub, hai bisogno delle tue credenziali di Hugging Face. Se hai accesso ad un terminale, esegui il seguente comando nell'ambiente virtuale in cui è installata la libreria 🤗 Transformers. Questo memorizzerà il tuo token di accesso nella cartella cache di Hugging Face (di default `~/.cache/`): + +```bash +huggingface-cli login +``` + +Se stai usando un notebook come Jupyter o Colaboratory, assicurati di avere la libreria [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) installata. Questa libreria ti permette di interagire in maniera programmatica con l'Hub. + +```bash +pip install huggingface_hub +``` + +Utilizza `notebook_login` per accedere all'Hub, e segui il link [qui](https://huggingface.co/settings/token) per generare un token con cui effettuare il login: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Converti un modello per tutti i framework + +Per assicurarti che il tuo modello possa essere utilizzato da persone che lavorano con un framework differente, ti raccomandiamo di convertire e caricare il tuo modello sia con i checkpoint di PyTorch che con quelli di TensorFlow. Anche se è possibile caricare il modello da un framework diverso, se si salta questo passaggio, il caricamento sarà più lento perché 🤗 Transformers ha bisogno di convertire i checkpoint al momento. + +Convertire un checkpoint per un altro framework è semplice. Assicurati di avere PyTorch e TensorFlow installati (vedi [qui](installation) per le istruzioni d'installazione), e poi trova il modello specifico per il tuo compito nell'altro framework. + + + +Specifica `from_tf=True` per convertire un checkpoint da TensorFlow a PyTorch: + +```py +>>> pt_model = DistilBertForSequenceClassification.from_pretrained( +... "path/verso/il-nome-magnifico-che-hai-scelto", from_tf=True +... ) +>>> pt_model.save_pretrained("path/verso/il-nome-magnifico-che-hai-scelto") +``` + + +Specifica `from_pt=True` per convertire un checkpoint da PyTorch a TensorFlow: + +```py +>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained( +... "path/verso/il-nome-magnifico-che-hai-scelto", from_pt=True +... ) +``` + +Poi puoi salvare il tuo nuovo modello in TensorFlow con il suo nuovo checkpoint: + +```py +>>> tf_model.save_pretrained("path/verso/il-nome-magnifico-che-hai-scelto") +``` + + +Se un modello è disponibile in Flax, puoi anche convertire un checkpoint da PyTorch a Flax: + +```py +>>> flax_model = FlaxDistilBertForSequenceClassification.from_pretrained( +... "path/verso/il-nome-magnifico-che-hai-scelto", from_pt=True +... ) +``` + + + +## Condividi un modello durante il training + + + + + +Condividere un modello nell'Hub è tanto semplice quanto aggiungere un parametro extra o un callback. Ricorda dal [tutorial sul fine-tuning](training), la classe [`TrainingArguments`] è dove specifichi gli iperparametri e le opzioni addizionali per l'allenamento. Una di queste opzioni di training include l'abilità di condividere direttamente un modello nell'Hub. 
Imposta `push_to_hub=True` in [`TrainingArguments`]: + +```py +>>> training_args = TrainingArguments(output_dir="il-mio-bellissimo-modello", push_to_hub=True) +``` + +Passa gli argomenti per il training come di consueto al [`Trainer`]: + +```py +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=small_train_dataset, +... eval_dataset=small_eval_dataset, +... compute_metrics=compute_metrics, +... ) +``` + +Dopo aver effettuato il fine-tuning del tuo modello, chiama [`~transformers.Trainer.push_to_hub`] sul [`Trainer`] per condividere il modello allenato nell'Hub. 🤗 Transformers aggiungerà in modo automatico persino gli iperparametri, i risultati del training e le versioni del framework alla scheda del tuo modello (model card, in inglese)! + +```py +>>> trainer.push_to_hub() +``` + + +Condividi un modello nell'Hub con [`PushToHubCallback`]. Nella funzione [`PushToHubCallback`], aggiungi: + +- Una directory di output per il tuo modello. +- Un tokenizer. +- L'`hub_model_id`, che è il tuo username sull'Hub e il nome del modello. + +```py +>>> from transformers import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="./il_path_dove_salvare_il_tuo_modello", +... tokenizer=tokenizer, +... hub_model_id="il-tuo-username/il-mio-bellissimo-modello", +... ) +``` + +Aggiungi il callback a [`fit`](https://keras.io/api/models/model_training_apis/), e 🤗 Transformers caricherà il modello allenato nell'Hub: + +```py +>>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback) +``` + + + +## Utilizzare la funzione `push_to_hub` + +Puoi anche chiamare `push_to_hub` direttamente sul tuo modello per caricarlo nell'Hub. + +Specifica il nome del tuo modello in `push_to_hub`: + +```py +>>> pt_model.push_to_hub("il-mio-bellissimo-modello") +``` + +Questo crea un repository sotto il proprio username con il nome del modello `il-mio-bellissimo-modello`. Ora chiunque può caricare il tuo modello con la funzione `from_pretrained`: + +```py +>>> from transformers import AutoModel + +>>> model = AutoModel.from_pretrained("il-tuo-username/il-mio-bellissimo-modello") +``` + +Se fai parte di un'organizzazione e vuoi invece condividere un modello sotto il nome dell'organizzazione, aggiungi il parametro `organization`: + +```py +>>> pt_model.push_to_hub("il-mio-bellissimo-modello", organization="la-mia-fantastica-org") +``` + +La funzione `push_to_hub` può essere anche utilizzata per aggiungere altri file al repository del modello. Per esempio, aggiungi un tokenizer ad un repository di un modello: + +```py +>>> tokenizer.push_to_hub("il-mio-bellissimo-modello") +``` + +O magari potresti voler aggiungere la versione di TensorFlow del tuo modello PyTorch a cui hai fatto fine-tuning: + +```py +>>> tf_model.push_to_hub("il-mio-bellissimo-modello") +``` + +Ora quando navighi nel tuo profilo Hugging Face, dovresti vedere il tuo repository del modello appena creato. Premendo sulla scheda **Files** vengono visualizzati tutti i file caricati nel repository. + +Per maggiori dettagli su come creare e caricare file ad un repository, fai riferimento alla documentazione [qui](https://huggingface.co/docs/hub/how-to-upstream). + +## Carica un modello utilizzando l'interfaccia web + +Chi preferisce un approccio senza codice può caricare un modello tramite l'interfaccia web dell'hub. 
Visita [huggingface.co/new](https://huggingface.co/new) per creare un nuovo repository: + +![new_model_repo](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/new_model_repo.png) + +Da qui, aggiungi alcune informazioni sul tuo modello: + +- Seleziona il/la **owner** del repository. Puoi essere te o qualunque organizzazione di cui fai parte. +- Scegli un nome per il tuo modello, il quale sarà anche il nome del repository. +- Scegli se il tuo modello è pubblico o privato. +- Specifica la licenza utilizzata per il tuo modello. + +Ora premi sulla scheda **Files** e premi sul pulsante **Add file** per caricare un nuovo file al tuo repository. Trascina poi un file per caricarlo e aggiungere un messaggio di commit. + +![upload_file](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/upload_file.png) + +## Aggiungi una scheda del modello + +Per assicurarti che chiunque possa comprendere le abilità, limitazioni, i potenziali bias e le considerazioni etiche del tuo modello, per favore aggiungi una scheda del modello (model card, in inglese) al tuo repository. La scheda del modello è definita nel file `README.md`. Puoi aggiungere una scheda del modello: + +* Creando manualmente e caricando un file `README.md`. +* Premendo sul pulsante **Edit model card** nel repository del tuo modello. + +Dai un'occhiata alla [scheda del modello](https://huggingface.co/distilbert-base-uncased) di DistilBert per avere un buon esempio del tipo di informazioni che una scheda di un modello deve includere. Per maggiori dettagli legati ad altre opzioni che puoi controllare nel file `README.md`, come l'impatto ambientale o widget di esempio, fai riferimento alla documentazione [qui](https://huggingface.co/docs/hub/models-cards). diff --git a/docs/source/it/model_sharing.mdx b/docs/source/it/model_sharing.mdx deleted file mode 100644 index 87ba2b5b3421..000000000000 --- a/docs/source/it/model_sharing.mdx +++ /dev/null @@ -1,234 +0,0 @@ - - -# Condividi un modello - -Gli ultimi due tutorial ti hanno mostrato come puoi fare fine-tuning di un modello con PyTorch, Keras e 🤗 Accelerate per configurazioni distribuite. Il prossimo passo è quello di condividere il tuo modello con la community! In Hugging Face, crediamo nella condivisione della conoscenza e delle risorse in modo da democratizzare l'intelligenza artificiale per chiunque. Ti incoraggiamo a considerare di condividere il tuo modello con la community per aiutare altre persone a risparmiare tempo e risorse. - -In questo tutorial, imparerai due metodi per la condivisione di un modello trained o fine-tuned nel [Model Hub](https://huggingface.co/models): - -- Condividi in modo programmatico i tuoi file nell'Hub. -- Trascina i tuoi file nell'Hub mediante interfaccia grafica. - - - - - -Per condividere un modello con la community, hai bisogno di un account su [huggingface.co](https://huggingface.co/join). Puoi anche unirti ad un'organizzazione esistente o crearne una nuova. - - - -## Caratteristiche dei repository - -Ogni repository nel Model Hub si comporta come un tipico repository di GitHub. I nostri repository offrono il versionamento, la cronologia dei commit, e la possibilità di visualizzare le differenze. - -Il versionamento all'interno del Model Hub è basato su git e [git-lfs](https://git-lfs.github.com/). In altre parole, puoi trattare un modello come un unico repository, consentendo un maggiore controllo degli accessi e maggiore scalabilità. 
Il controllo delle versioni consente *revisions*, un metodo per appuntare una versione specifica di un modello con un hash di commit, un tag o un branch. - -Come risultato, puoi caricare una specifica versione di un modello con il parametro `revision`: - -```py ->>> model = AutoModel.from_pretrained( -... "julien-c/EsperBERTo-small", revision="v2.0.1" # nome di un tag, di un branch, o commit hash -... ) -``` - -Anche i file possono essere modificati facilmente in un repository ed è possibile visualizzare la cronologia dei commit e le differenze: - -![vis_diff](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vis_diff.png) - -## Configurazione - -Prima di condividere un modello nell'Hub, hai bisogno delle tue credenziali di Hugging Face. Se hai accesso ad un terminale, esegui il seguente comando nell'ambiente virtuale in cui è installata la libreria 🤗 Transformers. Questo memorizzerà il tuo token di accesso nella cartella cache di Hugging Face (di default `~/.cache/`): - -```bash -huggingface-cli login -``` - -Se stai usando un notebook come Jupyter o Colaboratory, assicurati di avere la libreria [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) installata. Questa libreria ti permette di interagire in maniera programmatica con l'Hub. - -```bash -pip install huggingface_hub -``` - -Utilizza `notebook_login` per accedere all'Hub, e segui il link [qui](https://huggingface.co/settings/token) per generare un token con cui effettuare il login: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Converti un modello per tutti i framework - -Per assicurarti che il tuo modello possa essere utilizzato da persone che lavorano con un framework differente, ti raccomandiamo di convertire e caricare il tuo modello sia con i checkpoint di PyTorch che con quelli di TensorFlow. Anche se è possibile caricare il modello da un framework diverso, se si salta questo passaggio, il caricamento sarà più lento perché 🤗 Transformers ha bisogno di convertire i checkpoint al momento. - -Convertire un checkpoint per un altro framework è semplice. Assicurati di avere PyTorch e TensorFlow installati (vedi [qui](installation) per le istruzioni d'installazione), e poi trova il modello specifico per il tuo compito nell'altro framework. - - - -Specifica `from_tf=True` per convertire un checkpoint da TensorFlow a PyTorch: - -```py ->>> pt_model = DistilBertForSequenceClassification.from_pretrained( -... "path/verso/il-nome-magnifico-che-hai-scelto", from_tf=True -... ) ->>> pt_model.save_pretrained("path/verso/il-nome-magnifico-che-hai-scelto") -``` - - -Specifica `from_pt=True` per convertire un checkpoint da PyTorch a TensorFlow: - -```py ->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained( -... "path/verso/il-nome-magnifico-che-hai-scelto", from_pt=True -... ) -``` - -Poi puoi salvare il tuo nuovo modello in TensorFlow con il suo nuovo checkpoint: - -```py ->>> tf_model.save_pretrained("path/verso/il-nome-magnifico-che-hai-scelto") -``` - - -Se un modello è disponibile in Flax, puoi anche convertire un checkpoint da PyTorch a Flax: - -```py ->>> flax_model = FlaxDistilBertForSequenceClassification.from_pretrained( -... "path/verso/il-nome-magnifico-che-hai-scelto", from_pt=True -... ) -``` - - - -## Condividi un modello durante il training - - - - - -Condividere un modello nell'Hub è tanto semplice quanto aggiungere un parametro extra o un callback. 
Ricorda dal [tutorial sul fine-tuning](training), la classe [`TrainingArguments`] è dove specifichi gli iperparametri e le opzioni addizionali per l'allenamento. Una di queste opzioni di training include l'abilità di condividere direttamente un modello nell'Hub. Imposta `push_to_hub=True` in [`TrainingArguments`]: - -```py ->>> training_args = TrainingArguments(output_dir="il-mio-bellissimo-modello", push_to_hub=True) -``` - -Passa gli argomenti per il training come di consueto al [`Trainer`]: - -```py ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=small_train_dataset, -... eval_dataset=small_eval_dataset, -... compute_metrics=compute_metrics, -... ) -``` - -Dopo aver effettuato il fine-tuning del tuo modello, chiama [`~transformers.Trainer.push_to_hub`] sul [`Trainer`] per condividere il modello allenato nell'Hub. 🤗 Transformers aggiungerà in modo automatico persino gli iperparametri, i risultati del training e le versioni del framework alla scheda del tuo modello (model card, in inglese)! - -```py ->>> trainer.push_to_hub() -``` - - -Condividi un modello nell'Hub con [`PushToHubCallback`]. Nella funzione [`PushToHubCallback`], aggiungi: - -- Una directory di output per il tuo modello. -- Un tokenizer. -- L'`hub_model_id`, che è il tuo username sull'Hub e il nome del modello. - -```py ->>> from transformers.keras.callbacks import PushToHubCallback - ->>> push_to_hub_callback = PushToHubCallback( -... output_dir="./il_path_dove_salvare_il_tuo_modello", -... tokenizer=tokenizer, -... hub_model_id="il-tuo-username/il-mio-bellissimo-modello", -... ) -``` - -Aggiungi il callback a [`fit`](https://keras.io/api/models/model_training_apis/), e 🤗 Transformers caricherà il modello allenato nell'Hub: - -```py ->>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback) -``` - - - -## Utilizzare la funzione `push_to_hub` - -Puoi anche chiamare `push_to_hub` direttamente sul tuo modello per caricarlo nell'Hub. - -Specifica il nome del tuo modello in `push_to_hub`: - -```py ->>> pt_model.push_to_hub("il-mio-bellissimo-modello") -``` - -Questo crea un repository sotto il proprio username con il nome del modello `il-mio-bellissimo-modello`. Ora chiunque può caricare il tuo modello con la funzione `from_pretrained`: - -```py ->>> from transformers import AutoModel - ->>> model = AutoModel.from_pretrained("il-tuo-username/il-mio-bellissimo-modello") -``` - -Se fai parte di un'organizzazione e vuoi invece condividere un modello sotto il nome dell'organizzazione, aggiungi il parametro `organization`: - -```py ->>> pt_model.push_to_hub("il-mio-bellissimo-modello", organization="la-mia-fantastica-org") -``` - -La funzione `push_to_hub` può essere anche utilizzata per aggiungere altri file al repository del modello. Per esempio, aggiungi un tokenizer ad un repository di un modello: - -```py ->>> tokenizer.push_to_hub("il-mio-bellissimo-modello") -``` - -O magari potresti voler aggiungere la versione di TensorFlow del tuo modello PyTorch a cui hai fatto fine-tuning: - -```py ->>> tf_model.push_to_hub("il-mio-bellissimo-modello") -``` - -Ora quando navighi nel tuo profilo Hugging Face, dovresti vedere il tuo repository del modello appena creato. Premendo sulla scheda **Files** vengono visualizzati tutti i file caricati nel repository. - -Per maggiori dettagli su come creare e caricare file ad un repository, fai riferimento alla documentazione [qui](https://huggingface.co/docs/hub/how-to-upstream). 
- -## Carica un modello utilizzando l'interfaccia web - -Chi preferisce un approccio senza codice può caricare un modello tramite l'interfaccia web dell'hub. Visita [huggingface.co/new](https://huggingface.co/new) per creare un nuovo repository: - -![new_model_repo](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/new_model_repo.png) - -Da qui, aggiungi alcune informazioni sul tuo modello: - -- Seleziona il/la **owner** del repository. Puoi essere te o qualunque organizzazione di cui fai parte. -- Scegli un nome per il tuo modello, il quale sarà anche il nome del repository. -- Scegli se il tuo modello è pubblico o privato. -- Specifica la licenza utilizzata per il tuo modello. - -Ora premi sulla scheda **Files** e premi sul pulsante **Add file** per caricare un nuovo file al tuo repository. Trascina poi un file per caricarlo e aggiungere un messaggio di commit. - -![upload_file](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/upload_file.png) - -## Aggiungi una scheda del modello - -Per assicurarti che chiunque possa comprendere le abilità, limitazioni, i potenziali bias e le considerazioni etiche del tuo modello, per favore aggiungi una scheda del modello (model card, in inglese) al tuo repository. La scheda del modello è definita nel file `README.md`. Puoi aggiungere una scheda del modello: - -* Creando manualmente e caricando un file `README.md`. -* Premendo sul pulsante **Edit model card** nel repository del tuo modello. - -Dai un'occhiata alla [scheda del modello](https://huggingface.co/distilbert-base-uncased) di DistilBert per avere un buon esempio del tipo di informazioni che una scheda di un modello deve includere. Per maggiori dettagli legati ad altre opzioni che puoi controllare nel file `README.md`, come l'impatto ambientale o widget di esempio, fai riferimento alla documentazione [qui](https://huggingface.co/docs/hub/models-cards). diff --git a/docs/source/it/multilingual.md b/docs/source/it/multilingual.md new file mode 100644 index 000000000000..889c620ab29d --- /dev/null +++ b/docs/source/it/multilingual.md @@ -0,0 +1,178 @@ + + +# Modelli multilingue per l'inferenza + +[[open-in-colab]] + +Ci sono diversi modelli multilingue in 🤗 Transformers, e il loro utilizzo per l'inferenza differisce da quello dei modelli monolingua. Non *tutti* gli utilizzi dei modelli multilingue sono però diversi. Alcuni modelli, come [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased), possono essere usati come un modello monolingua. Questa guida ti mostrerà come utilizzare modelli multilingue che utilizzano un modo diverso per fare l'inferenza. + +## XLM + +XLM ha dieci diversi checkpoint, di cui solo uno è monolingua. I nove checkpoint rimanenti possono essere suddivisi in due categorie: i checkpoint che utilizzano i language embeddings e quelli che non li utilizzano. 
+ +### XLM con language embeddings + +I seguenti modelli XLM utilizzano gli embeddings linguistici per specificare la lingua utilizzata per l'inferenza: + +- `xlm-mlm-ende-1024` (Modellazione mascherata del linguaggio (Masked language modeling, in inglese), Inglese-Tedesco) +- `xlm-mlm-enfr-1024` (Modellazione mascherata del linguaggio, Inglese-Francese) +- `xlm-mlm-enro-1024` (Modellazione mascherata del linguaggio, Inglese-Rumeno) +- `xlm-mlm-xnli15-1024` (Modellazione mascherata del linguaggio, lingue XNLI) +- `xlm-mlm-tlm-xnli15-1024` (Modellazione mascherata del linguaggio + traduzione, lingue XNLI) +- `xlm-clm-enfr-1024` (Modellazione causale del linguaggio, Inglese-Francese) +- `xlm-clm-ende-1024` (Modellazione causale del linguaggio, Inglese-Tedesco) + +Gli embeddings linguistici sono rappresentati come un tensore delle stesse dimensioni dell' `input_ids` passato al modello. I valori in questi tensori dipendono dal linguaggio usato e sono identificati dagli attributi `lang2id` e `id2lang` del tokenizer. + +In questo esempio, carica il checkpoint `xlm-clm-enfr-1024` (Modellazione causale del linguaggio, Inglese-Francese): + +```py +>>> import torch +>>> from transformers import XLMTokenizer, XLMWithLMHeadModel + +>>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024") +>>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024") +``` + +L'attributo `lang2id` del tokenizer mostra il linguaggio del modello e il suo ids: + +```py +>>> print(tokenizer.lang2id) +{'en': 0, 'fr': 1} +``` + +Poi, crea un esempio di input: + +```py +>>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1 +``` + +Imposta l'id del linguaggio a `"en"` e usalo per definire il language embedding. Il language embedding è un tensore riempito con `0` perché questo è il language id per l'inglese. Questo tensore dovrebbe avere la stessa dimensione di `input_ids`. + +```py +>>> language_id = tokenizer.lang2id["en"] # 0 +>>> langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, ..., 0]) + +>>> # We reshape it to be of size (batch_size, sequence_length) +>>> langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1) +``` + +Adesso puoi inserire `input_ids` e language embedding nel modello: + +```py +>>> outputs = model(input_ids, langs=langs) +``` + +Lo script [run_generation.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-generation/run_generation.py) può generare testo tramite i language embeddings usando i checkpoints `xlm-clm`. + +### XLM senza language embeddings + +I seguenti modelli XLM non richiedono l'utilizzo dei language embeddings per fare inferenza: + +- `xlm-mlm-17-1280` (Modellazione mascherata del linguaggio, 17 lingue) +- `xlm-mlm-100-1280` (Modellazione mascherata del linguaggio, 100 lingue) + +Questi modelli sono utilizzati per rappresentazioni generiche di frasi, a differenza dei precedenti checkpoints XML. + +## BERT + +Il seguente modello BERT può essere usato per compiti multilingue: + +- `bert-base-multilingual-uncased` (Modellazione mascherata del linguaggio + Previsione della prossima frase, 102 lingue) +- `bert-base-multilingual-cased` (Modellazione mascherata del linguaggio + Previsione della prossima frase, 104 lingue) + +Questi modelli non richiedono language embeddings per fare inferenza. Riescono ad identificare il linguaggio dal contesto e inferire di conseguenza. 
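+Per esempio, uno sketch minimo (a puro scopo illustrativo) con la `pipeline` di fill-mask: lo stesso checkpoint multilingue gestisce frasi in lingue diverse senza language embeddings:
+
+```py
+>>> from transformers import pipeline
+
+>>> # La lingua viene dedotta dal contesto della frase
+>>> unmasker = pipeline("fill-mask", model="bert-base-multilingual-cased")
+
+>>> # Stesso modello, lingue diverse
+>>> predictions_en = unmasker("Paris is the [MASK] of France.")
+>>> predictions_it = unmasker("Roma è la [MASK] d'Italia.")
+```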
+ +## XLM-RoBERTa + +Il seguente modello XLM-RoBERTa può essere usato per compiti multilingue: + +- `xlm-roberta-base` (Modellazione mascherata del linguaggio, 100 lingue) +- `xlm-roberta-large` (Modellazione mascherata del linguaggio, 100 lingue) + +XLM-RoBERTa è stato addestrato su 2.5TB di dati CommonCrawl appena creati e puliti in 100 lingue. Offre notevoli vantaggi rispetto ai modelli multilingue rilasciati in precedenza, come mBERT o XLM, in compiti come la classificazione, l'etichettatura delle sequenze e la risposta alle domande. + +## M2M100 + +Il seguente modello M2M100 può essere usato per compiti multilingue: + +- `facebook/m2m100_418M` (Traduzione) +- `facebook/m2m100_1.2B` (Traduzione) + +In questo esempio, carica il checkpoint `facebook/m2m100_418M` per tradurre dal cinese all'inglese. Puoi impostare la lingua di partenza nel tokenizer: + +```py +>>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer + +>>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." +>>> chinese_text = "不要插手巫師的事務, 因為他們是微妙的, 很快就會發怒." + +>>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="zh") +>>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") +``` + +Applica il tokenizer al testo: + +```py +>>> encoded_zh = tokenizer(chinese_text, return_tensors="pt") +``` + +M2M100 forza l'id della lingua obiettivo come primo token generato per tradurre nella lingua obiettivo. Imposta il parametro `forced_bos_token_id` a `en` nel metodo `generate` per tradurre in inglese: + +```py +>>> generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en")) +>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +'Do not interfere with the matters of the witches, because they are delicate and will soon be angry.' +``` + +## MBart + +Il seguente modello MBart può essere usato per compiti multilingue: + +- `facebook/mbart-large-50-one-to-many-mmt` (Traduzione automatica multilingue uno-a-molti, 50 lingue) +- `facebook/mbart-large-50-many-to-many-mmt` (Traduzione automatica multilingue molti-a-molti, 50 lingue) +- `facebook/mbart-large-50-many-to-one-mmt` (Traduzione automatica multilingue molti-a-uno, 50 lingue) +- `facebook/mbart-large-50` (Traduzione multilingue, 50 lingue) +- `facebook/mbart-large-cc25` + +In questo esempio, carica il checkpoint `facebook/mbart-large-50-many-to-many-mmt` per tradurre dal finlandese all'inglese. Puoi impostare la lingua di partenza nel tokenizer: + +```py +>>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM + +>>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." +>>> fi_text = "Älä sekaannu velhojen asioihin, sillä ne ovat hienovaraisia ja nopeasti vihaisia." + +>>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", src_lang="fi_FI") +>>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") +``` + +Applica il tokenizer sul testo: + +```py +>>> encoded_en = tokenizer(en_text, return_tensors="pt") +``` + +MBart forza l'id della lingua obiettivo come primo token generato per tradurre nella lingua obiettivo. 
Imposta il parametro `forced_bos_token_id` a `en` nel metodo `generate` per tradurre in inglese: + +```py +>>> generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.lang_code_to_id("en_XX")) +>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +"Don't interfere with the wizard's affairs, because they are subtle, will soon get angry." +``` + +Se stai usando il checkpoint `facebook/mbart-large-50-many-to-one-mmt`, non hai bisogno di forzare l'id della lingua obiettivo come primo token generato altrimenti l'uso è lo stesso. \ No newline at end of file diff --git a/docs/source/it/multilingual.mdx b/docs/source/it/multilingual.mdx deleted file mode 100644 index a8ccec97d0a7..000000000000 --- a/docs/source/it/multilingual.mdx +++ /dev/null @@ -1,174 +0,0 @@ - - -# Modelli multilingue per l'inferenza - -[[open-in-colab]] - -Ci sono diversi modelli multilingue in 🤗 Transformers, e il loro utilizzo per l'inferenza differisce da quello dei modelli monolingua. Non *tutti* gli utilizzi dei modelli multilingue sono però diversi. Alcuni modelli, come [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased), possono essere usati come un modello monolingua. Questa guida ti mostrerà come utilizzare modelli multilingue che utilizzano un modo diverso per fare l'inferenza. - -## XLM - -XLM ha dieci diversi checkpoint, di cui solo uno è monolingua. I nove checkpoint rimanenti possono essere suddivisi in due categorie: i checkpoint che utilizzano i language embeddings e quelli che non li utilizzano. - -### XLM con language embeddings - -I seguenti modelli XLM utilizzano gli embeddings linguistici per specificare la lingua utilizzata per l'inferenza: - -- `xlm-mlm-ende-1024` (Modellazione mascherata del linguaggio (Masked language modeling, in inglese), Inglese-Tedesco) -- `xlm-mlm-enfr-1024` (Modellazione mascherata del linguaggio, Inglese-Francese) -- `xlm-mlm-enro-1024` (Modellazione mascherata del linguaggio, Inglese-Rumeno) -- `xlm-mlm-xnli15-1024` (Modellazione mascherata del linguaggio, lingue XNLI) -- `xlm-mlm-tlm-xnli15-1024` (Modellazione mascherata del linguaggio + traduzione, lingue XNLI) -- `xlm-clm-enfr-1024` (Modellazione causale del linguaggio, Inglese-Francese) -- `xlm-clm-ende-1024` (Modellazione causale del linguaggio, Inglese-Tedesco) - -Gli embeddings linguistici sono rappresentati come un tensore delle stesse dimensioni dell' `input_ids` passato al modello. I valori in questi tensori dipendono dal linguaggio usato e sono identificati dagli attributi `lang2id` e `id2lang` del tokenizer. - -In questo esempio, carica il checkpoint `xlm-clm-enfr-1024` (Modellazione causale del linguaggio, Inglese-Francese): - -```py ->>> import torch ->>> from transformers import XLMTokenizer, XLMWithLMHeadModel - ->>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024") ->>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024") -``` - -L'attributo `lang2id` del tokenizer mostra il linguaggio del modello e il suo ids: - -```py ->>> print(tokenizer.lang2id) -{'en': 0, 'fr': 1} -``` - -Poi, crea un esempio di input: - -```py ->>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1 -``` - -Imposta l'id del linguaggio a `"en"` e usalo per definire il language embedding. Il language embedding è un tensore riempito con `0` perché questo è il language id per l'inglese. Questo tensore dovrebbe avere la stessa dimensione di `input_ids`. 
- -```py ->>> language_id = tokenizer.lang2id["en"] # 0 ->>> langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, ..., 0]) - ->>> # We reshape it to be of size (batch_size, sequence_length) ->>> langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1) -``` - -Adesso puoi inserire `input_ids` e language embedding nel modello: - -```py ->>> outputs = model(input_ids, langs=langs) -``` - -Lo script [run_generation.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-generation/run_generation.py) può generare testo tramite i language embeddings usando i checkpoints `xlm-clm`. - -### XLM senza language embeddings - -I seguenti modelli XLM non richiedono l'utilizzo dei language embeddings per fare inferenza: - -- `xlm-mlm-17-1280` (Modellazione mascherata del linguaggio, 17 lingue) -- `xlm-mlm-100-1280` (Modellazione mascherata del linguaggio, 100 lingue) - -Questi modelli sono utilizzati per rappresentazioni generiche di frasi, a differenza dei precedenti checkpoints XML. - -## BERT - -Il seguente modello BERT può essere usato per compiti multilingue: - -- `bert-base-multilingual-uncased` (Modellazione mascherata del linguaggio + Previsione della prossima frase, 102 lingue) -- `bert-base-multilingual-cased` (Modellazione mascherata del linguaggio + Previsione della prossima frase, 104 lingue) - -Questi modelli non richiedono language embeddings per fare inferenza. Riescono ad identificare il linguaggio dal contesto e inferire di conseguenza. - -## XLM-RoBERTa - -Il seguente modello XLM-RoBERTa può essere usato per compiti multilingue: - -- `xlm-roberta-base` (Modellazione mascherata del linguaggio, 100 lingue) -- `xlm-roberta-large` (Modellazione mascherata del linguaggio, 100 lingue) - -XLM-RoBERTa è stato addestrato su 2.5TB di dati CommonCrawl appena creati e puliti in 100 lingue. Offre notevoli vantaggi rispetto ai modelli multilingue rilasciati in precedenza, come mBERT o XLM, in compiti come la classificazione, l'etichettatura delle sequenze e la risposta alle domande. - -## M2M100 - -Il seguente modello M2M100 può essere usato per compiti multilingue: - -- `facebook/m2m100_418M` (Traduzione) -- `facebook/m2m100_1.2B` (Traduzione) - -In questo esempio, carica il checkpoint `facebook/m2m100_418M` per tradurre dal cinese all'inglese. Puoi impostare la lingua di partenza nel tokenizer: - -```py ->>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer - ->>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." ->>> chinese_text = "不要插手巫師的事務, 因為他們是微妙的, 很快就會發怒." - ->>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="zh") ->>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") -``` - -Applica il tokenizer al testo: - -```py ->>> encoded_zh = tokenizer(chinese_text, return_tensors="pt") -``` - -M2M100 forza l'id della lingua obiettivo come primo token generato per tradurre nella lingua obiettivo. Imposta il parametro `forced_bos_token_id` a `en` nel metodo `generate` per tradurre in inglese: - -```py ->>> generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en")) ->>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) -'Do not interfere with the matters of the witches, because they are delicate and will soon be angry.' 
-``` - -## MBart - -Il seguente modello MBart può essere usato per compiti multilingue: - -- `facebook/mbart-large-50-one-to-many-mmt` (Traduzione automatica multilingue uno-a-molti, 50 lingue) -- `facebook/mbart-large-50-many-to-many-mmt` (Traduzione automatica multilingue molti-a-molti, 50 lingue) -- `facebook/mbart-large-50-many-to-one-mmt` (Traduzione automatica multilingue molti-a-uno, 50 lingue) -- `facebook/mbart-large-50` (Traduzione multilingue, 50 lingue) -- `facebook/mbart-large-cc25` - -In questo esempio, carica il checkpoint `facebook/mbart-large-50-many-to-many-mmt` per tradurre dal finlandese all'inglese. Puoi impostare la lingua di partenza nel tokenizer: - -```py ->>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - ->>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." ->>> fi_text = "Älä sekaannu velhojen asioihin, sillä ne ovat hienovaraisia ja nopeasti vihaisia." - ->>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", src_lang="fi_FI") ->>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") -``` - -Applica il tokenizer sul testo: - -```py ->>> encoded_en = tokenizer(en_text, return_tensors="pt") -``` - -MBart forza l'id della lingua obiettivo come primo token generato per tradurre nella lingua obiettivo. Imposta il parametro `forced_bos_token_id` a `en` nel metodo `generate` per tradurre in inglese: - -```py ->>> generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.lang_code_to_id("en_XX")) ->>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) -"Don't interfere with the wizard's affairs, because they are subtle, will soon get angry." -``` - -Se stai usando il checkpoint `facebook/mbart-large-50-many-to-one-mmt`, non hai bisogno di forzare l'id della lingua obiettivo come primo token generato altrimenti l'uso è lo stesso. \ No newline at end of file diff --git a/docs/source/it/perf_hardware.md b/docs/source/it/perf_hardware.md new file mode 100644 index 000000000000..a579362e2b1b --- /dev/null +++ b/docs/source/it/perf_hardware.md @@ -0,0 +1,155 @@ + + + +# Hardware ottimizzato per l'addestramento + +L'hardware utilizzato per eseguire l'addestramento del modello e l'inferenza può avere un grande effetto sulle prestazioni. Per un analisi approfondita delle GPUs, assicurati di dare un'occhiata all'eccellente [blog post](https://timdettmers.com/2020/09/07/which-gpu-for-deep-learning/) di Tim Dettmer. + +Diamo un'occhiata ad alcuni consigli pratici per la configurazione della GPU. + +## GPU +Quando si addestrano modelli più grandi ci sono essenzialmente tre opzioni: +- GPUs piu' grandi +- Piu' GPUs +- Piu' CPU e piu' NVMe (scaricato da [DeepSpeed-Infinity](main_classes/deepspeed#nvme-support)) + +Iniziamo dal caso in cui ci sia una singola GPU. + +### Potenza e Raffreddamento + +Se hai acquistato una costosa GPU di fascia alta, assicurati di darle la potenza corretta e un raffreddamento sufficiente. + +**Potenza**: + +Alcune schede GPU consumer di fascia alta hanno 2 e talvolta 3 prese di alimentazione PCI-E a 8 pin. Assicurati di avere tanti cavi PCI-E a 8 pin indipendenti da 12 V collegati alla scheda quante sono le prese. Non utilizzare le 2 fessure a un'estremità dello stesso cavo (noto anche come cavo a spirale). Cioè se hai 2 prese sulla GPU, vuoi 2 cavi PCI-E a 8 pin che vanno dall'alimentatore alla scheda e non uno che abbia 2 connettori PCI-E a 8 pin alla fine! 
In caso contrario, non otterrai tutte le prestazioni ufficiali. + +Ciascun cavo di alimentazione PCI-E a 8 pin deve essere collegato a una guida da 12 V sul lato dell'alimentatore e può fornire fino a 150 W di potenza. + +Alcune altre schede possono utilizzare connettori PCI-E a 12 pin e questi possono fornire fino a 500-600 W di potenza. + +Le schede di fascia bassa possono utilizzare connettori a 6 pin, che forniscono fino a 75 W di potenza. + +Inoltre vuoi un alimentatore (PSU) di fascia alta che abbia una tensione stabile. Alcuni PSU di qualità inferiore potrebbero non fornire alla scheda la tensione stabile di cui ha bisogno per funzionare al massimo. + +E ovviamente l'alimentatore deve avere abbastanza Watt inutilizzati per alimentare la scheda. + +**Raffreddamento**: + +Quando una GPU si surriscalda, inizierà a rallentare e non fornirà le prestazioni mssimali e potrebbe persino spegnersi se diventasse troppo calda. + +È difficile dire l'esatta temperatura migliore a cui aspirare quando una GPU è molto caricata, ma probabilmente qualsiasi cosa al di sotto di +80°C va bene, ma più bassa è meglio - forse 70-75°C è un intervallo eccellente in cui trovarsi. È probabile che il rallentamento inizi a circa 84-90°C. Ma oltre alla limitazione delle prestazioni, una temperatura molto elevata prolungata è probabile che riduca la durata di una GPU. + +Diamo quindi un'occhiata a uno degli aspetti più importanti quando si hanno più GPU: la connettività. + +### Connettività multi-GPU + +Se utilizzi più GPU, il modo in cui le schede sono interconnesse può avere un enorme impatto sul tempo totale di allenamento. Se le GPU si trovano sullo stesso nodo fisico, puoi eseguire: + +``` +nvidia-smi topo -m +``` + +e ti dirà come sono interconnesse le GPU. Su una macchina con doppia GPU e collegata a NVLink, molto probabilmente vedrai qualcosa del tipo: + +``` + GPU0 GPU1 CPU Affinity NUMA Affinity +GPU0 X NV2 0-23 N/A +GPU1 NV2 X 0-23 N/A +``` + +su una macchina diversa senza NVLink potremmo vedere: + +``` + GPU0 GPU1 CPU Affinity NUMA Affinity +GPU0 X PHB 0-11 N/A +GPU1 PHB X 0-11 N/A +``` + +Il rapporto include questa legenda: + +``` + X = Self + SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) + NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node + PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) + PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) + PIX = Connection traversing at most a single PCIe bridge + NV# = Connection traversing a bonded set of # NVLinks +``` + +Quindi il primo rapporto `NV2` ci dice che le GPU sono interconnesse con 2 NVLinks e nel secondo report `PHB` abbiamo una tipica configurazione PCIe+Bridge a livello di consumatore. + +Controlla che tipo di connettività hai sulla tua configurazione. Alcuni di questi renderanno la comunicazione tra le carte più veloce (es. NVLink), altri più lenta (es. PHB). + +A seconda del tipo di soluzione di scalabilità utilizzata, la velocità di connettività potrebbe avere un impatto maggiore o minore. Se le GPU devono sincronizzarsi raramente, come in DDP, l'impatto di una connessione più lenta sarà meno significativo. Se le GPU devono scambiarsi messaggi spesso, come in ZeRO-DP, una connettività più veloce diventa estremamente importante per ottenere un addestramento più veloce. 
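+Come verifica complementare, uno sketch minimo (ipotizzando PyTorch installato e almeno due GPU visibili) per controllare da codice se l'accesso peer-to-peer diretto tra due schede è disponibile:
+
+```py
+import torch
+
+# Elenca le GPU visibili
+for i in range(torch.cuda.device_count()):
+    print(i, torch.cuda.get_device_name(i))
+
+# True se la GPU 0 può accedere direttamente alla memoria della GPU 1
+# (tipicamente il caso con NVLink o P2P su PCIe)
+if torch.cuda.device_count() >= 2:
+    print(torch.cuda.can_device_access_peer(0, 1))
+```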
+ +#### NVlink + +[NVLink](https://en.wikipedia.org/wiki/NVLink) è un collegamento di comunicazione a corto raggio multilinea seriale basato su cavo sviluppato da Nvidia. + +Ogni nuova generazione fornisce una larghezza di banda più veloce, ad es. ecco una citazione da [Nvidia Ampere GA102 GPU Architecture](https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/ampere/pdf/NVIDIA-ampere-GA102-GPU-Architecture-Whitepaper-V1.pdf): + +> Third-Generation NVLink® +> GA102 GPUs utilize NVIDIA’s third-generation NVLink interface, which includes four x4 links, +> with each link providing 14.0625 GB/sec bandwidth in each direction between two GPUs. Four +> links provide 56.25 GB/sec bandwidth in each direction, and 112.5 GB/sec total bandwidth +> between two GPUs. Two RTX 3090 GPUs can be connected together for SLI using NVLink. +> (Note that 3-Way and 4-Way SLI configurations are not supported.) + +Quindi più `X` si ottiene nel rapporto di `NVX` nell'output di `nvidia-smi topo -m`, meglio è. La generazione dipenderà dall'architettura della tua GPU. + +Confrontiamo l'esecuzione di un training del modello di linguaggio gpt2 su un piccolo campione di wikitext + +I risultati sono: + + +| NVlink | Time | +| ----- | ---: | +| Y | 101s | +| N | 131s | + + +Puoi vedere che NVLink completa l'addestramento circa il 23% più velocemente. Nel secondo benchmark utilizziamo `NCCL_P2P_DISABLE=1` per dire alle GPU di non utilizzare NVLink. + +Ecco il codice benchmark completo e gli output: + +```bash +# DDP w/ NVLink + +rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch \ +--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \ +--dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \ +--output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 + +{'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69} + +# DDP w/o NVLink + +rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 python -m torch.distributed.launch \ +--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \ +--dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train +--output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 + +{'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69} +``` + +Hardware: 2x TITAN RTX 24GB each + NVlink with 2 NVLinks (`NV2` in `nvidia-smi topo -m`) +Software: `pytorch-1.8-to-be` + `cuda-11.0` / `transformers==4.3.0.dev0` \ No newline at end of file diff --git a/docs/source/it/perf_hardware.mdx b/docs/source/it/perf_hardware.mdx deleted file mode 100644 index 0bfdbc8fe686..000000000000 --- a/docs/source/it/perf_hardware.mdx +++ /dev/null @@ -1,151 +0,0 @@ - - - -# Hardware ottimizzato per l'addestramento - -L'hardware utilizzato per eseguire l'addestramento del modello e l'inferenza può avere un grande effetto sulle prestazioni. Per un analisi approfondita delle GPUs, assicurati di dare un'occhiata all'eccellente [blog post](https://timdettmers.com/2020/09/07/which-gpu-for-deep-learning/) di Tim Dettmer. - -Diamo un'occhiata ad alcuni consigli pratici per la configurazione della GPU. - -## GPU -Quando si addestrano modelli più grandi ci sono essenzialmente tre opzioni: -- GPUs piu' grandi -- Piu' GPUs -- Piu' CPU e piu' NVMe (scaricato da [DeepSpeed-Infinity](main_classes/deepspeed#nvme-support)) - -Iniziamo dal caso in cui ci sia una singola GPU. 
- -### Potenza e Raffreddamento - -Se hai acquistato una costosa GPU di fascia alta, assicurati di darle la potenza corretta e un raffreddamento sufficiente. - -**Potenza**: - -Alcune schede GPU consumer di fascia alta hanno 2 e talvolta 3 prese di alimentazione PCI-E a 8 pin. Assicurati di avere tanti cavi PCI-E a 8 pin indipendenti da 12 V collegati alla scheda quante sono le prese. Non utilizzare le 2 fessure a un'estremità dello stesso cavo (noto anche come cavo a spirale). Cioè se hai 2 prese sulla GPU, vuoi 2 cavi PCI-E a 8 pin che vanno dall'alimentatore alla scheda e non uno che abbia 2 connettori PCI-E a 8 pin alla fine! In caso contrario, non otterrai tutte le prestazioni ufficiali. - -Ciascun cavo di alimentazione PCI-E a 8 pin deve essere collegato a una guida da 12 V sul lato dell'alimentatore e può fornire fino a 150 W di potenza. - -Alcune altre schede possono utilizzare connettori PCI-E a 12 pin e questi possono fornire fino a 500-600 W di potenza. - -Le schede di fascia bassa possono utilizzare connettori a 6 pin, che forniscono fino a 75 W di potenza. - -Inoltre vuoi un alimentatore (PSU) di fascia alta che abbia una tensione stabile. Alcuni PSU di qualità inferiore potrebbero non fornire alla scheda la tensione stabile di cui ha bisogno per funzionare al massimo. - -E ovviamente l'alimentatore deve avere abbastanza Watt inutilizzati per alimentare la scheda. - -**Raffreddamento**: - -Quando una GPU si surriscalda, inizierà a rallentare e non fornirà le prestazioni mssimali e potrebbe persino spegnersi se diventasse troppo calda. - -È difficile dire l'esatta temperatura migliore a cui aspirare quando una GPU è molto caricata, ma probabilmente qualsiasi cosa al di sotto di +80°C va bene, ma più bassa è meglio - forse 70-75°C è un intervallo eccellente in cui trovarsi. È probabile che il rallentamento inizi a circa 84-90°C. Ma oltre alla limitazione delle prestazioni, una temperatura molto elevata prolungata è probabile che riduca la durata di una GPU. - -Diamo quindi un'occhiata a uno degli aspetti più importanti quando si hanno più GPU: la connettività. - -### Connettività multi-GPU - -Se utilizzi più GPU, il modo in cui le schede sono interconnesse può avere un enorme impatto sul tempo totale di allenamento. Se le GPU si trovano sullo stesso nodo fisico, puoi eseguire: - -``` -nvidia-smi topo -m -``` - -e ti dirà come sono interconnesse le GPU. Su una macchina con doppia GPU e collegata a NVLink, molto probabilmente vedrai qualcosa del tipo: - -``` - GPU0 GPU1 CPU Affinity NUMA Affinity -GPU0 X NV2 0-23 N/A -GPU1 NV2 X 0-23 N/A -``` - -su una macchina diversa senza NVLink potremmo vedere: - -``` - GPU0 GPU1 CPU Affinity NUMA Affinity -GPU0 X PHB 0-11 N/A -GPU1 PHB X 0-11 N/A -``` - -Il rapporto include questa legenda: - -``` - X = Self - SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) - NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node - PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) - PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) - PIX = Connection traversing at most a single PCIe bridge - NV# = Connection traversing a bonded set of # NVLinks -``` - -Quindi il primo rapporto `NV2` ci dice che le GPU sono interconnesse con 2 NVLinks e nel secondo report `PHB` abbiamo una tipica configurazione PCIe+Bridge a livello di consumatore. 
- -Controlla che tipo di connettività hai sulla tua configurazione. Alcuni di questi renderanno la comunicazione tra le carte più veloce (es. NVLink), altri più lenta (es. PHB). - -A seconda del tipo di soluzione di scalabilità utilizzata, la velocità di connettività potrebbe avere un impatto maggiore o minore. Se le GPU devono sincronizzarsi raramente, come in DDP, l'impatto di una connessione più lenta sarà meno significativo. Se le GPU devono scambiarsi messaggi spesso, come in ZeRO-DP, una connettività più veloce diventa estremamente importante per ottenere un addestramento più veloce. - -#### NVlink - -[NVLink](https://en.wikipedia.org/wiki/NVLink) è un collegamento di comunicazione a corto raggio multilinea seriale basato su cavo sviluppato da Nvidia. - -Ogni nuova generazione fornisce una larghezza di banda più veloce, ad es. ecco una citazione da [Nvidia Ampere GA102 GPU Architecture](https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/ampere/pdf/NVIDIA-ampere-GA102-GPU-Architecture-Whitepaper-V1.pdf): - -> Third-Generation NVLink® -> GA102 GPUs utilize NVIDIA’s third-generation NVLink interface, which includes four x4 links, -> with each link providing 14.0625 GB/sec bandwidth in each direction between two GPUs. Four -> links provide 56.25 GB/sec bandwidth in each direction, and 112.5 GB/sec total bandwidth -> between two GPUs. Two RTX 3090 GPUs can be connected together for SLI using NVLink. -> (Note that 3-Way and 4-Way SLI configurations are not supported.) - -Quindi più `X` si ottiene nel rapporto di `NVX` nell'output di `nvidia-smi topo -m`, meglio è. La generazione dipenderà dall'architettura della tua GPU. - -Confrontiamo l'esecuzione di un training del modello di linguaggio gpt2 su un piccolo campione di wikitext - -I risultati sono: - - -| NVlink | Time | -| ----- | ---: | -| Y | 101s | -| N | 131s | - - -Puoi vedere che NVLink completa l'addestramento circa il 23% più velocemente. Nel secondo benchmark utilizziamo `NCCL_P2P_DISABLE=1` per dire alle GPU di non utilizzare NVLink. - -Ecco il codice benchmark completo e gli output: - -```bash -# DDP w/ NVLink - -rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch \ ---nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \ ---dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \ ---output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 - -{'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69} - -# DDP w/o NVLink - -rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 python -m torch.distributed.launch \ ---nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \ ---dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train ---output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 - -{'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69} -``` - -Hardware: 2x TITAN RTX 24GB each + NVlink with 2 NVLinks (`NV2` in `nvidia-smi topo -m`) -Software: `pytorch-1.8-to-be` + `cuda-11.0` / `transformers==4.3.0.dev0` \ No newline at end of file diff --git a/docs/source/it/perf_infer_cpu.md b/docs/source/it/perf_infer_cpu.md new file mode 100644 index 000000000000..baae51a5a978 --- /dev/null +++ b/docs/source/it/perf_infer_cpu.md @@ -0,0 +1,79 @@ + + +# Inferenza Efficiente su CPU + +Questa guida si concentra sull'inferenza di modelli di grandi dimensioni in modo efficiente sulla CPU. 
+ +## `BetterTransformer` per inferenza più rapida + +Abbiamo integrato di recente `BetterTransformer` per fare inferenza più rapidamente con modelli per testi, immagini e audio. Visualizza la documentazione sull'integrazione [qui](https://huggingface.co/docs/optimum/bettertransformer/overview) per maggiori dettagli. + +## PyTorch JIT-mode (TorchScript) + +TorchScript è un modo di creare modelli serializzabili e ottimizzabili da codice PyTorch. Ogni programmma TorchScript può esere salvato da un processo Python e caricato in un processo dove non ci sono dipendenze Python. +Comparandolo con l'eager mode di default, jit mode in PyTorch normalmente fornisce prestazioni migliori per l'inferenza del modello da parte di metodologie di ottimizzazione come la operator fusion. + +Per una prima introduzione a TorchScript, vedi la Introduction to [PyTorch TorchScript tutorial](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html#tracing-modules). + +### IPEX Graph Optimization con JIT-mode + +Intel® Extension per PyTorch fornnisce ulteriori ottimizzazioni in jit mode per i modelli della serie Transformers. Consigliamo vivamente agli utenti di usufruire dei vantaggi di Intel® Extension per PyTorch con jit mode. Alcuni operator patterns usati fequentemente dai modelli Transformers models sono già supportati in Intel® Extension per PyTorch con jit mode fusions. Questi fusion patterns come Multi-head-attention fusion, Concat Linear, Linear+Add, Linear+Gelu, Add+LayerNorm fusion and etc. sono abilitati e hanno buone performance. I benefici della fusion è fornito agli utenti in modo trasparente. In base alle analisi, il ~70% dei problemi più popolari in NLP question-answering, text-classification, and token-classification possono avere benefici sulle performance grazie ai fusion patterns sia per Float32 precision che per BFloat16 Mixed precision. + +Vedi maggiori informazioni per [IPEX Graph Optimization](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/graph_optimization.html). + +#### Installazione di IPEX + +I rilasci di IPEX seguono PyTorch, verifica i vari approcci per [IPEX installation](https://intel.github.io/intel-extension-for-pytorch/). + +### Utilizzo del JIT-mode + +Per abilitare JIT-mode in Trainer per evaluation e prediction, devi aggiungere `jit_mode_eval` negli argomenti di Trainer. + + + +per PyTorch >= 1.14.0. JIT-mode potrebe giovare a qualsiasi modello di prediction e evaluaion visto che il dict input è supportato in jit.trace + +per PyTorch < 1.14.0. JIT-mode potrebbe giovare ai modelli il cui ordine dei parametri corrisponde all'ordine delle tuple in ingresso in jit.trace, come i modelli per question-answering. +Nel caso in cui l'ordine dei parametri seguenti non corrisponda all'ordine delle tuple in ingresso in jit.trace, come nei modelli di text-classification, jit.trace fallirà e lo cattureremo con una eccezione al fine di renderlo un fallback. Il logging è usato per notificare gli utenti. + + + +Trovi un esempo con caso d'uso in [Transformers question-answering](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) + +- Inference using jit mode on CPU: + +
+```bash
+python run_qa.py \
+--model_name_or_path csarron/bert-base-uncased-squad-v1 \
+--dataset_name squad \
+--do_eval \
+--max_seq_length 384 \
+--doc_stride 128 \
+--output_dir /tmp/ \
+--no_cuda \
+--jit_mode_eval
+```
+
+- Inference with IPEX using jit mode on CPU:
+
+```bash
+python run_qa.py \
+--model_name_or_path csarron/bert-base-uncased-squad-v1 \
+--dataset_name squad \
+--do_eval \
+--max_seq_length 384 \
+--doc_stride 128 \
+--output_dir /tmp/ \
+--no_cuda \
+--use_ipex \
+--jit_mode_eval
+```
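+
+Le stesse opzioni si possono impostare anche da codice con `TrainingArguments`; segue uno sketch puramente indicativo dei flag corrispondenti, da passare poi al `Trainer` insieme a modello e dataset:
+
+```py
+from transformers import TrainingArguments
+
+# Equivalente programmatico dei flag da riga di comando mostrati sopra
+training_args = TrainingArguments(
+    output_dir="/tmp/",
+    do_eval=True,
+    no_cuda=True,        # esegue su CPU
+    jit_mode_eval=True,  # usa torch.jit.trace per evaluation e prediction
+    use_ipex=True,       # ottimizzazioni IPEX (richiede intel_extension_for_pytorch)
+)
+```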
diff --git a/docs/source/it/perf_infer_gpu_many.md b/docs/source/it/perf_infer_gpu_many.md new file mode 100644 index 000000000000..b78cb34e1d6d --- /dev/null +++ b/docs/source/it/perf_infer_gpu_many.md @@ -0,0 +1,28 @@ + + +# Inferenza Efficiente su GPU Multiple + +Questo documento contiene informazioni su come fare inferenza in maniera efficiente su GPU multiple. + + + +Nota: Un setup con GPU multiple può utilizzare la maggior parte delle strategie descritte nella [sezione con GPU singola](./perf_infer_gpu_one). Tuttavia, è necessario conoscere delle tecniche semplici che possono essere utilizzate per un risultato migliore. + + + +## `BetterTransformer` per inferenza più rapida + +Abbiamo recentemente integrato `BetterTransformer` per inferenza più rapida su multi-GPU per modelli su testo, immagini e audio. Controlla il documento con queste integrazioni [qui](https://huggingface.co/docs/optimum/bettertransformer/overview) per maggiori dettagli. diff --git a/docs/source/it/perf_infer_gpu_one.md b/docs/source/it/perf_infer_gpu_one.md new file mode 100644 index 000000000000..16f77b3b1f31 --- /dev/null +++ b/docs/source/it/perf_infer_gpu_one.md @@ -0,0 +1,112 @@ + + +# Inferenza efficiente su GPU singola + +Questo documento sarà presto completato con informazioni su come effetture l'inferenza su una singola GPU. Nel frattempo è possibile consultare [la guida per l'addestramento su una singola GPU](perf_train_gpu_one) e [la guida per l'inferenza su CPU](perf_infer_cpu). + +## `BetterTransformer` per l'inferenza più veloce + +Abbiamo recentemente integrato `BetterTransformer` per velocizzare l'inferenza su GPU per modelli di testo, immagini e audio. Per maggiori dettagli, consultare la documentazione su questa integrazione [qui](https://huggingface.co/docs/optimum/bettertransformer/overview). + +## Integrazione di `bitsandbytes` per Int8 mixed-precision matrix decomposition + + + +Nota che questa funzione può essere utilizzata anche nelle configurazioni multi GPU. + + + +Dal paper [`LLM.int8() : 8-bit Matrix Multiplication for Transformers at Scale`](https://arxiv.org/abs/2208.07339), noi supportiamo l'integrazione di Hugging Face per tutti i modelli dell'Hub con poche righe di codice. +Il metodo `nn.Linear` riduce la dimensione di 2 per i pesi `float16` e `bfloat16` e di 4 per i pesi `float32`, con un impatto quasi nullo sulla qualità, operando sugli outlier in half-precision. + +![HFxbitsandbytes.png](https://cdn-uploads.huggingface.co/production/uploads/1659861207959-62441d1d9fdefb55a0b7d12c.png) + +Il metodo Int8 mixed-precision matrix decomposition funziona separando la moltiplicazione tra matrici in due flussi: (1) una matrice di flusso di outlier di caratteristiche sistematiche moltiplicata in fp16, (2) in flusso regolare di moltiplicazione di matrici int8 (99,9%). Con questo metodo, è possibile effettutare inferenza int8 per modelli molto grandi senza degrado predittivo. +Per maggiori dettagli sul metodo, consultare il [paper](https://arxiv.org/abs/2208.07339) o il nostro [blogpost sull'integrazione](https://huggingface.co/blog/hf-bitsandbytes-integration). + +![MixedInt8.gif](https://cdn-uploads.huggingface.co/production/uploads/1660567469965-62441d1d9fdefb55a0b7d12c.gif) + +Nota che è necessaria una GPU per eseguire modelli di tipo mixed-8bit, poiché i kernel sono stati compilati solo per le GPU. 
Prima di utilizzare questa funzione, assicurarsi di disporre di memoria sufficiente sulla GPU per memorizzare un quarto del modello (o la metà se i pesi del modello sono in mezza precisione). +Di seguito sono riportate alcune note per aiutarvi a utilizzare questo modulo, oppure seguite le dimostrazioni su [Google colab](#colab-demos). + +### Requisiti + +- Se si dispone di `bitsandbytes<0.37.0`, assicurarsi di eseguire su GPU NVIDIA che supportano tensor cores a 8 bit (Turing, Ampere o architetture più recenti - ad esempio T4, RTX20s, RTX30s, A40-A100). Per `bitsandbytes>=0.37.0`, tutte le GPU dovrebbero essere supportate. +- Installare la versione corretta di `bitsandbytes` eseguendo: +`pip install bitsandbytes>=0.31.5`. +- Installare `accelerate` eseguendo: +`pip install accelerate>=0.12.0` + +### Esecuzione di modelli mixed-Int8 - configurazione per singola GPU + +Dopo aver installato le librerie necessarie, ecco come caricare il tuo modello mixed 8-bit: + +```py +from transformers import AutoModelForCausalLM + +model_name = "bigscience/bloom-2b5" +model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True) +``` + +Per la generazione di testo, si consiglia di: + +* utilizzare il metodo `generate()` del modello invece della funzione `pipeline()`. Sebbene l'inferenza sia possibile con la funzione `pipeline()`, essa non è ottimizzata per i modelli mixed-8bit e sarà più lenta rispetto all'uso del metodo `generate()`. Inoltre, alcune strategie di campionamento, come il campionamento nucleus, non sono supportate dalla funzione `pipeline()` per i modelli mixed-8bit. +* collocare tutti gli input sullo stesso dispositivo del modello. + +Ecco un semplice esempio: + +```py +from transformers import AutoModelForCausalLM, AutoTokenizer + +model_name = "bigscience/bloom-2b5" +tokenizer = AutoTokenizer.from_pretrained(model_name) +model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True) + +text = "Hello, my llama is cute" +inputs = tokenizer(text, return_tensors="pt").to("cuda") +generated_ids = model_8bit.generate(**inputs) +outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) +``` + + +### Esecuzione di modelli mixed-8bit - configurazione multi GPU + +Il modello mixed-8bit si carica su più GPU nel modo seguente (stesso comando della configurazione a GPU singola): +```py +model_name = "bigscience/bloom-2b5" +model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True) +``` +Puoi controllare la quantità di memoria GPU da allocare su ogni GPU usando `accelerate`, tramite l'argomento `max_memory`, come segue: + +```py +max_memory_mapping = {0: "1GB", 1: "2GB"} +model_name = "bigscience/bloom-3b" +model_8bit = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping +) +``` +In questo esempio, la prima GPU utilizzerà 1 GB di memoria e la seconda 2 GB. + +### Colab demos + +Con questo metodo è possibile fare inferenza su modelli che prima non era possibile eseguire su Google Colab. +Guardate la demo per l'esecuzione di T5-11b (42GB in fp32)!
Utilizzo la quantizzazione a 8 bit su Google Colab: + +[![Open In Colab: T5-11b demo](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1YORPWx4okIHXnjW7MSAidXN29mPVNT7F?usp=sharing) + +Oppure questa demo di BLOOM-3B: + +[![Open In Colab: BLOOM-3b demo](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1qOjXfQIAULfKvZqwCen8-MoWKGdSatZ4?usp=sharing) \ No newline at end of file diff --git a/docs/source/it/perf_infer_special.md b/docs/source/it/perf_infer_special.md new file mode 100644 index 000000000000..3e2c0a5c288e --- /dev/null +++ b/docs/source/it/perf_infer_special.md @@ -0,0 +1,18 @@ + + +# Inferenza su Hardware Specializzato + +Questo documento sarà completato a breve con la documentazione per l'inferenza su hardware specializzato. Nel frattempo puoi controllare [la guida per fare inferenza sulle CPU](perf_infer_cpu). \ No newline at end of file diff --git a/docs/source/it/perf_train_cpu.md b/docs/source/it/perf_train_cpu.md new file mode 100644 index 000000000000..c91baeec8800 --- /dev/null +++ b/docs/source/it/perf_train_cpu.md @@ -0,0 +1,69 @@ + + +# Addestramento efficiente su CPU + +Questa guida si concentra su come addestrare in maniera efficiente grandi modelli su CPU. + +## Mixed precision con IPEX + +IPEX è ottimizzato per CPU con AVX-512 o superiore, e funziona per le CPU con solo AVX2. Pertanto, si prevede che le prestazioni saranno più vantaggiose per le le CPU Intel con AVX-512 o superiori, mentre le CPU con solo AVX2 (ad esempio, le CPU AMD o le CPU Intel più vecchie) potrebbero ottenere prestazioni migliori con IPEX, ma non sono garantite. IPEX offre ottimizzazioni delle prestazioni per l'addestramento della CPU sia con Float32 che con BFloat16. L'uso di BFloat16 è l'argomento principale delle seguenti sezioni. + +Il tipo di dati a bassa precisione BFloat16 è stato supportato in modo nativo su 3rd Generation Xeon® Scalable Processors (aka Cooper Lake) con AVX512 e sarà supportata dalla prossima generazione di Intel® Xeon® Scalable Processors con Intel® Advanced Matrix Extensions (Intel® AMX) instruction set con prestazioni ulteriormente migliorate. L'Auto Mixed Precision per il backende della CPU è stato abilitato da PyTorch-1.10. allo stesso tempo, il supporto di Auto Mixed Precision con BFloat16 per CPU e l'ottimizzazione degli operatori BFloat16 è stata abilitata in modo massiccio in Intel® Extension per PyTorch, and parzialmente aggiornato al branch master di PyTorch. Gli utenti possono ottenere prestazioni migliori ed users experience con IPEX Auto Mixed Precision.. + +Vedi informazioni più dettagliate su [Auto Mixed Precision](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/amp.html). + +### Installazione di IPEX: + +Il rilascio di IPEX segue quello di PyTorch, da installare via pip: + +| PyTorch Version | IPEX version | +| :---------------: | :----------: | +| 1.13 | 1.13.0+cpu | +| 1.12 | 1.12.300+cpu | +| 1.11 | 1.11.200+cpu | +| 1.10 | 1.10.100+cpu | + +```bash +pip install intel_extension_for_pytorch== -f https://developer.intel.com/ipex-whl-stable-cpu +``` + +Vedi altri approcci per [IPEX installation](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/installation.html). + +### Utilizzo nel Trainer + +Per abilitare la auto mixed precision con IPEX in Trainer, l'utende dovrebbe aggiungere `use_ipex`, `bf16` e `no_cuda` negli argomenti del comando di addestramento. 
+ +Vedi un esempio di un caso d'uso [Transformers question-answering](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering). + +- Addestramento con IPEX usando l'auto mixed precision BF16 su CPU: + +
 python run_qa.py \
+--model_name_or_path bert-base-uncased \
+--dataset_name squad \
+--do_train \
+--do_eval \
+--per_device_train_batch_size 12 \
+--learning_rate 3e-5 \
+--num_train_epochs 2 \
+--max_seq_length 384 \
+--doc_stride 128 \
+--output_dir /tmp/debug_squad/ \
+--use_ipex \
+--bf16 --no_cuda
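+
+Se invece dello script si usa direttamente il [`Trainer`] in Python, quello che segue è uno schizzo indicativo degli stessi flag espressi come [`TrainingArguments`]; modello, tokenizer e dataset sono omessi e vanno definiti a parte:
+
+```py
+from transformers import TrainingArguments, Trainer
+
+training_args = TrainingArguments(
+    output_dir="/tmp/debug_squad/",
+    use_ipex=True,   # abilita le ottimizzazioni IPEX
+    bf16=True,       # auto mixed precision con BFloat16
+    no_cuda=True,    # forza l'addestramento su CPU
+    per_device_train_batch_size=12,
+    learning_rate=3e-5,
+    num_train_epochs=2,
+)
+
+# trainer = Trainer(model=model, args=training_args, train_dataset=..., eval_dataset=...)
+# trainer.train()
+```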
+ +### Esempi pratici + +Blog: [Accelerating PyTorch Transformers with Intel Sapphire Rapids](https://huggingface.co/blog/intel-sapphire-rapids) diff --git a/docs/source/it/perf_train_cpu_many.md b/docs/source/it/perf_train_cpu_many.md new file mode 100644 index 000000000000..2fb10ee4ba49 --- /dev/null +++ b/docs/source/it/perf_train_cpu_many.md @@ -0,0 +1,141 @@ + + +# Addestramento effciente su multiple CPU + +Quando l'addestramento su una singola CPU è troppo lento, possiamo usare CPU multiple. Quasta guida si concentra su DDP basato su PyTorch abilitando l'addetramento distribuito su CPU in maniera efficiente. + +## Intel® oneCCL Bindings per PyTorch + +[Intel® oneCCL](https://github.com/oneapi-src/oneCCL) (collective communications library) è una libreria per l'addestramento efficiente del deep learning in distribuito e implementa collettivi come allreduce, allgather, alltoall. Per maggiori informazioni su oneCCL, fai riferimento a [oneCCL documentation](https://spec.oneapi.com/versions/latest/elements/oneCCL/source/index.html) e [oneCCL specification](https://spec.oneapi.com/versions/latest/elements/oneCCL/source/index.html). + +Il modulo `oneccl_bindings_for_pytorch` (`torch_ccl` precedentemente alla versione 1.12) implementa PyTorch C10D ProcessGroup API e può essere caricato dinamicamente com external ProcessGroup e funziona solo su piattaforma Linux al momento. + +Qui trovi informazioni più dettagliate per [oneccl_bind_pt](https://github.com/intel/torch-ccl). + +### Intel® oneCCL Bindings per l'installazione PyTorch: + +I file wheel sono disponibili per le seguenti versioni di Python: + +| Extension Version | Python 3.6 | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10 | +| :---------------: | :--------: | :--------: | :--------: | :--------: | :---------: | +| 1.13.0 | | √ | √ | √ | √ | +| 1.12.100 | | √ | √ | √ | √ | +| 1.12.0 | | √ | √ | √ | √ | +| 1.11.0 | | √ | √ | √ | √ | +| 1.10.0 | √ | √ | √ | √ | | + +```bash +pip install oneccl_bind_pt=={pytorch_version} -f https://developer.intel.com/ipex-whl-stable-cpu +``` + +dove `{pytorch_version}` deve essere la tua versione di PyTorch, per l'stanza 1.13.0. +Verifica altri approcci per [oneccl_bind_pt installation](https://github.com/intel/torch-ccl). +Le versioni di oneCCL e PyTorch devono combaciare. + + + +oneccl_bindings_for_pytorch 1.12.0 prebuilt wheel does not work with PyTorch 1.12.1 (it is for PyTorch 1.12.0) +PyTorch 1.12.1 should work with oneccl_bindings_for_pytorch 1.12.100 + + + +## Intel® MPI library + +Usa questa implementazione basata su standard MPI per fornire una architettura flessibile, efficiente, scalabile su cluster per Intel®. Questo componente è parte di Intel® oneAPI HPC Toolkit. + +oneccl_bindings_for_pytorch è installato insieme al set di strumenti MPI. Necessità di reperire l'ambiente prima di utilizzarlo. + +per Intel® oneCCL >= 1.12.0 + +```bash +oneccl_bindings_for_pytorch_path=$(python -c "from oneccl_bindings_for_pytorch import cwd; print(cwd)") +source $oneccl_bindings_for_pytorch_path/env/setvars.sh +``` + +per Intel® oneCCL con versione < 1.12.0 + +```bash +torch_ccl_path=$(python -c "import torch; import torch_ccl; import os; print(os.path.abspath(os.path.dirname(torch_ccl.__file__)))") +source $torch_ccl_path/env/setvars.sh +``` + +#### Installazione IPEX: + +IPEX fornisce ottimizzazioni delle prestazioni per l'addestramento della CPU sia con Float32 che con BFloat16; puoi fare riferimento a [single CPU section](./perf_train_cpu). 
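+
+A scopo illustrativo, lo schizzo seguente (basato sul pattern documentato in [oneccl_bind_pt](https://github.com/intel/torch-ccl)) mostra che è il semplice import di `oneccl_bindings_for_pytorch` a registrare il backend `ccl` in `torch.distributed`; è, in sostanza, ciò su cui si appoggia il Trainer quando si passa `--ddp_backend ccl`. Le variabili PMI_RANK/PMI_SIZE sono normalmente impostate da mpirun; i valori di ripiego indicati servono solo per una prova locale a processo singolo:
+
+```py
+import os
+
+import torch.distributed as dist
+import oneccl_bindings_for_pytorch  # noqa: F401 - l'import registra il backend "ccl"
+
+# valori di ripiego, solo per una prova locale a processo singolo
+os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
+os.environ.setdefault("MASTER_PORT", "29500")
+rank = int(os.environ.get("PMI_RANK", 0))
+world_size = int(os.environ.get("PMI_SIZE", 1))
+
+dist.init_process_group(backend="ccl", rank=rank, world_size=world_size)
+print(f"processo {dist.get_rank()} di {dist.get_world_size()} inizializzato con backend ccl")
+```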
+ +Il seguente "Utilizzo in Trainer" prende come esempio mpirun nella libreria Intel® MPI. + +## Utilizzo in Trainer + +Per abilitare l'addestramento distribuito multi CPU nel Trainer con il ccl backend, gli utenti devono aggiungere **`--ddp_backend ccl`** negli argomenti del comando. + +Vediamo un esempio per il [question-answering example](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) + +Il seguente comando abilita due processi sul nodo Xeon, con un processo in esecuzione per ogni socket. Le variabili OMP_NUM_THREADS/CCL_WORKER_COUNT possono essere impostate per una prestazione ottimale. + +```shell script + export CCL_WORKER_COUNT=1 + export MASTER_ADDR=127.0.0.1 + mpirun -n 2 -genv OMP_NUM_THREADS=23 \ + python3 run_qa.py \ + --model_name_or_path bert-large-uncased \ + --dataset_name squad \ + --do_train \ + --do_eval \ + --per_device_train_batch_size 12 \ + --learning_rate 3e-5 \ + --num_train_epochs 2 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir /tmp/debug_squad/ \ + --no_cuda \ + --ddp_backend ccl \ + --use_ipex +``` + +Il seguente comando abilita l'addestramento per un totale di quattro processi su due Xeon (node0 e node1, prendendo node0 come processo principale), ppn (processes per node) è impostato a 2, on un processo in esecuzione per ogni socket. Le variabili OMP_NUM_THREADS/CCL_WORKER_COUNT possono essere impostate per una prestazione ottimale. + +In node0, è necessario creare un file di configurazione che contenga gli indirizzi IP di ciascun nodo (per esempio hostfile) e passare il percorso del file di configurazione come parametro. + +```shell script + cat hostfile + xxx.xxx.xxx.xxx #node0 ip + xxx.xxx.xxx.xxx #node1 ip +``` + +A questo punto, esegui il seguente comando nel nodo0 e **4DDP** sarà abilitato in node0 e node1 con BF16 auto mixed precision: + +```shell script + export CCL_WORKER_COUNT=1 + export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip + mpirun -f hostfile -n 4 -ppn 2 \ + -genv OMP_NUM_THREADS=23 \ + python3 run_qa.py \ + --model_name_or_path bert-large-uncased \ + --dataset_name squad \ + --do_train \ + --do_eval \ + --per_device_train_batch_size 12 \ + --learning_rate 3e-5 \ + --num_train_epochs 2 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir /tmp/debug_squad/ \ + --no_cuda \ + --ddp_backend ccl \ + --use_ipex \ + --bf16 +``` diff --git a/docs/source/it/perf_train_special.md b/docs/source/it/perf_train_special.md new file mode 100644 index 000000000000..afe05d801d66 --- /dev/null +++ b/docs/source/it/perf_train_special.md @@ -0,0 +1,24 @@ + + +# Addestramento su Hardware Specializzato + + + + Nota: Molte delle strategie introdotte nella [sezione sulla GPU singola](perf_train_gpu_one) (come mixed precision training o gradient accumulation) e [sezione multi-GPU](perf_train_gpu_many) sono generiche e applicabili all'addestramento di modelli in generale quindi assicurati di dargli un'occhiata prima di immergerti in questa sezione. + + + +Questo documento sarà presto completato con informazioni su come effettuare la formazione su hardware specializzato. 
diff --git a/docs/source/it/perf_train_tpu.md b/docs/source/it/perf_train_tpu.md new file mode 100644 index 000000000000..663f83c499cb --- /dev/null +++ b/docs/source/it/perf_train_tpu.md @@ -0,0 +1,24 @@ + + +# Addestramento su TPU + + + + Nota: Molte delle strategie introdotte nella [sezione sulla GPU singola](perf_train_gpu_one) (come mixed precision training o gradient accumulation) e [sezione multi-GPU](perf_train_gpu_many) sono generiche e applicabili all'addestramento di modelli in generale quindi assicurati di dargli un'occhiata prima di immergerti in questa sezione. + + + +Questo documento sarà presto completato con informazioni su come effettuare la formazione su TPU. diff --git a/docs/source/it/pipeline_tutorial.md b/docs/source/it/pipeline_tutorial.md new file mode 100644 index 000000000000..056282b164ed --- /dev/null +++ b/docs/source/it/pipeline_tutorial.md @@ -0,0 +1,152 @@ + + +# Pipeline per l'inferenza + +La [`pipeline`] rende semplice usare qualsiasi modello dal [Model Hub](https://huggingface.co/models) per fare inferenza su diversi compiti come generazione del testo, segmentazione di immagini e classificazione di audio. Anche se non hai esperienza con una modalità specifica o non comprendi bene il codice che alimenta i modelli, è comunque possibile utilizzarli con l'opzione [`pipeline`]! Questa esercitazione ti insegnerà a: + +* Usare una [`pipeline`] per fare inferenza. +* Usare uno specifico tokenizer o modello. +* Usare una [`pipeline`] per compiti che riguardano audio e video. + + + +Dai un'occhiata alla documentazione di [`pipeline`] per una lista completa dei compiti supportati. + + + +## Utilizzo della Pipeline + +Nonostante ogni compito abbia una [`pipeline`] associata, è più semplice utilizzare l'astrazione generica della [`pipeline`] che contiene tutte quelle specifiche per ogni mansione. La [`pipeline`] carica automaticamente un modello predefinito e un tokenizer in grado di fare inferenza per il tuo compito. + +1. Inizia creando una [`pipeline`] e specificando il compito su cui fare inferenza: + +```py +>>> from transformers import pipeline + +>>> generator = pipeline(task="text-generation") +``` + +2. Inserisci il testo in input nella [`pipeline`]: + +```py +>>> generator( +... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone" +... ) # doctest: +SKIP +[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Iron-priests at the door to the east, and thirteen for the Lord Kings at the end of the mountain'}] +``` + +Se hai più di un input, inseriscilo in una lista: + +```py +>>> generator( +... [ +... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", +... "Nine for Mortal Men, doomed to die, One for the Dark Lord on his dark throne", +... ] +... ) # doctest: +SKIP +``` + +Qualsiasi parametro addizionale per il tuo compito può essere incluso nella [`pipeline`]. La mansione `text-generation` ha un metodo [`~generation.GenerationMixin.generate`] con diversi parametri per controllare l'output. Ad esempio, se desideri generare più di un output, utilizza il parametro `num_return_sequences`: + +```py +>>> generator( +... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", +... num_return_sequences=2, +... 
) # doctest: +SKIP +``` + +### Scegliere modello e tokenizer + +La [`pipeline`] accetta qualsiasi modello dal [Model Hub](https://huggingface.co/models). Ci sono tag nel Model Hub che consentono di filtrare i modelli per attività. Una volta che avrai scelto il modello appropriato, caricalo usando la corrispondente classe `AutoModelFor` e [`AutoTokenizer`]. Ad esempio, carica la classe [`AutoModelForCausalLM`] per un compito di causal language modeling: + +```py +>>> from transformers import AutoTokenizer, AutoModelForCausalLM + +>>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") +>>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") +``` + +Crea una [`pipeline`] per il tuo compito, specificando il modello e il tokenizer che hai caricato: + +```py +>>> from transformers import pipeline + +>>> generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer) +``` + +Inserisci il testo di input nella [`pipeline`] per generare del testo: + +```py +>>> generator( +... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone" +... ) # doctest: +SKIP +[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Dragon-lords (for them to rule in a world ruled by their rulers, and all who live within the realm'}] +``` + +## Audio pipeline + +La flessibilità della [`pipeline`] fa si che possa essere estesa ad attività sugli audio. + +Per esempio, classifichiamo le emozioni in questo clip audio: + +```py +>>> from datasets import load_dataset +>>> import torch + +>>> torch.manual_seed(42) # doctest: +IGNORE_RESULT +>>> ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") +>>> audio_file = ds[0]["audio"]["path"] +``` + +Trova un modello per la [classificazione audio](https://huggingface.co/models?pipeline_tag=audio-classification) sul Model Hub per eseguire un compito di riconoscimento automatico delle emozioni e caricalo nella [`pipeline`]: + +```py +>>> from transformers import pipeline + +>>> audio_classifier = pipeline( +... task="audio-classification", model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" +... ) +``` + +Inserisci il file audio nella [`pipeline`]: + +```py +>>> preds = audio_classifier(audio_file) +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> preds +[{'score': 0.1315, 'label': 'calm'}, {'score': 0.1307, 'label': 'neutral'}, {'score': 0.1274, 'label': 'sad'}, {'score': 0.1261, 'label': 'fearful'}, {'score': 0.1242, 'label': 'happy'}] +``` + +## Vision pipeline + +Infine, usare la [`pipeline`] per le attività sulle immagini è praticamente la stessa cosa. + +Specifica la tua attività e inserisci l'immagine nel classificatore. L'immagine può essere sia un link che un percorso sul tuo pc in locale. Per esempio, quale specie di gatto è raffigurata qui sotto? + +![pipeline-cat-chonk](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg) + +```py +>>> from transformers import pipeline + +>>> vision_classifier = pipeline(task="image-classification") +>>> preds = vision_classifier( +... images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... 
) +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> preds +[{'score': 0.4335, 'label': 'lynx, catamount'}, {'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}, {'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}, {'score': 0.0239, 'label': 'Egyptian cat'}, {'score': 0.0229, 'label': 'tiger cat'}] +``` diff --git a/docs/source/it/pipeline_tutorial.mdx b/docs/source/it/pipeline_tutorial.mdx deleted file mode 100644 index 64347164505f..000000000000 --- a/docs/source/it/pipeline_tutorial.mdx +++ /dev/null @@ -1,148 +0,0 @@ - - -# Pipeline per l'inferenza - -La [`pipeline`] rende semplice usare qualsiasi modello dal [Model Hub](https://huggingface.co/models) per fare inferenza su diversi compiti come generazione del testo, segmentazione di immagini e classificazione di audio. Anche se non hai esperienza con una modalità specifica o non comprendi bene il codice che alimenta i modelli, è comunque possibile utilizzarli con l'opzione [`pipeline`]! Questa esercitazione ti insegnerà a: - -* Usare una [`pipeline`] per fare inferenza. -* Usare uno specifico tokenizer o modello. -* Usare una [`pipeline`] per compiti che riguardano audio e video. - - - -Dai un'occhiata alla documentazione di [`pipeline`] per una lista completa dei compiti supportati. - - - -## Utilizzo della Pipeline - -Nonostante ogni compito abbia una [`pipeline`] associata, è più semplice utilizzare l'astrazione generica della [`pipeline`] che contiene tutte quelle specifiche per ogni mansione. La [`pipeline`] carica automaticamente un modello predefinito e un tokenizer in grado di fare inferenza per il tuo compito. - -1. Inizia creando una [`pipeline`] e specificando il compito su cui fare inferenza: - -```py ->>> from transformers import pipeline - ->>> generator = pipeline(task="text-generation") -``` - -2. Inserisci il testo in input nella [`pipeline`]: - -```py ->>> generator( -... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone" -... ) # doctest: +SKIP -[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Iron-priests at the door to the east, and thirteen for the Lord Kings at the end of the mountain'}] -``` - -Se hai più di un input, inseriscilo in una lista: - -```py ->>> generator( -... [ -... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", -... "Nine for Mortal Men, doomed to die, One for the Dark Lord on his dark throne", -... ] -... ) # doctest: +SKIP -``` - -Qualsiasi parametro addizionale per il tuo compito può essere incluso nella [`pipeline`]. La mansione `text-generation` ha un metodo [`~generation.GenerationMixin.generate`] con diversi parametri per controllare l'output. Ad esempio, se desideri generare più di un output, utilizza il parametro `num_return_sequences`: - -```py ->>> generator( -... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", -... num_return_sequences=2, -... ) # doctest: +SKIP -``` - -### Scegliere modello e tokenizer - -La [`pipeline`] accetta qualsiasi modello dal [Model Hub](https://huggingface.co/models). Ci sono tag nel Model Hub che consentono di filtrare i modelli per attività. Una volta che avrai scelto il modello appropriato, caricalo usando la corrispondente classe `AutoModelFor` e [`AutoTokenizer`]. 
Ad esempio, carica la classe [`AutoModelForCausalLM`] per un compito di causal language modeling: - -```py ->>> from transformers import AutoTokenizer, AutoModelForCausalLM - ->>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") ->>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") -``` - -Crea una [`pipeline`] per il tuo compito, specificando il modello e il tokenizer che hai caricato: - -```py ->>> from transformers import pipeline - ->>> generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer) -``` - -Inserisci il testo di input nella [`pipeline`] per generare del testo: - -```py ->>> generator( -... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone" -... ) # doctest: +SKIP -[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Dragon-lords (for them to rule in a world ruled by their rulers, and all who live within the realm'}] -``` - -## Audio pipeline - -La flessibilità della [`pipeline`] fa si che possa essere estesa ad attività sugli audio. - -Per esempio, classifichiamo le emozioni in questo clip audio: - -```py ->>> from datasets import load_dataset ->>> import torch - ->>> torch.manual_seed(42) # doctest: +IGNORE_RESULT ->>> ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") ->>> audio_file = ds[0]["audio"]["path"] -``` - -Trova un modello per la [classificazione audio](https://huggingface.co/models?pipeline_tag=audio-classification) sul Model Hub per eseguire un compito di riconoscimento automatico delle emozioni e caricalo nella [`pipeline`]: - -```py ->>> from transformers import pipeline - ->>> audio_classifier = pipeline( -... task="audio-classification", model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" -... ) -``` - -Inserisci il file audio nella [`pipeline`]: - -```py ->>> preds = audio_classifier(audio_file) ->>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] ->>> preds -[{'score': 0.1315, 'label': 'calm'}, {'score': 0.1307, 'label': 'neutral'}, {'score': 0.1274, 'label': 'sad'}, {'score': 0.1261, 'label': 'fearful'}, {'score': 0.1242, 'label': 'happy'}] -``` - -## Vision pipeline - -Infine, usare la [`pipeline`] per le attività sulle immagini è praticamente la stessa cosa. - -Specifica la tua attività e inserisci l'immagine nel classificatore. L'immagine può essere sia un link che un percorso sul tuo pc in locale. Per esempio, quale specie di gatto è raffigurata qui sotto? - -![pipeline-cat-chonk](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg) - -```py ->>> from transformers import pipeline - ->>> vision_classifier = pipeline(task="image-classification") ->>> preds = vision_classifier( -... images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" -... 
) ->>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] ->>> preds -[{'score': 0.4335, 'label': 'lynx, catamount'}, {'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}, {'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}, {'score': 0.0239, 'label': 'Egyptian cat'}, {'score': 0.0229, 'label': 'tiger cat'}] -``` diff --git a/docs/source/it/pr_checks.md b/docs/source/it/pr_checks.md new file mode 100644 index 000000000000..caa5fe32965b --- /dev/null +++ b/docs/source/it/pr_checks.md @@ -0,0 +1,135 @@ + + +# Controlli su una Pull Request + +Quando apri una pull request sui 🤗 Transformers, vengono eseguiti un discreto numero di controlli per assicurarsi che la patch che stai aggiungendo non stia rompendo qualcosa di esistente. Questi controlli sono di quattro tipi: +- test regolari +- costruzione della documentazione +- stile del codice e della documentazione +- coerenza generale del repository + +In questo documento, cercheremo di spiegare quali sono i vari controlli e le loro ragioni, oltre a spiegare come eseguire il debug locale se uno di essi fallisce sulla tua PR. + +Nota che tutti richiedono un'installazione dev: + +```bash +pip install transformers[dev] +``` + +o un'installazione modificabile: + +```bash +pip install -e .[dev] +``` + +all'interno del repo Transformers. + +## Tests + +Tutti i job che iniziano con `ci/circleci: run_tests_` eseguono parti della suite di test dei Transformers. Ognuno di questi job si concentra su una parte della libreria in un determinato ambiente: per esempio `ci/circleci: run_tests_pipelines_tf` esegue il test delle pipeline in un ambiente in cui è installato solo TensorFlow. + +Nota che per evitare di eseguire i test quando non ci sono cambiamenti reali nei moduli che si stanno testando, ogni volta viene eseguita solo una parte della suite di test: viene eseguita una utility per determinare le differenze nella libreria tra prima e dopo la PR (ciò che GitHub mostra nella scheda "Files changes") e sceglie i test che sono stati impattati dalla diff. Questa utility può essere eseguita localmente con: + +```bash +python utils/tests_fetcher.py +``` + +dalla root del repo Transformers. Di seguito ciò che farà: + +1. Controlla per ogni file nel diff se le modifiche sono nel codice o solo nei commenti o nelle docstrings. Vengono mantenuti solo i file con modifiche reali al codice. +2. Costruisce una mappa interna che fornisce per ogni file del codice sorgente della libreria tutti i file su cui ha un impatto ricorsivo. Si dice che il modulo A ha un impatto sul modulo B se il modulo B importa il modulo A. Per l'impatto ricorsivo, abbiamo bisogno di una catena di moduli che va dal modulo A al modulo B in cui ogni modulo importa il precedente. +3. Applica questa mappa ai file raccolti nel passaggio 1, si ottiene l'elenco dei file del modello interessati dalla PR. +4. Mappa ciascuno di questi file con i corrispondenti file di test e ottiene l'elenco dei test da eseguire. + +Quando esegui lo script in locale, dovresti ottenere la stampa dei risultati dei passi 1, 3 e 4 e quindi sapere quali test sono stati eseguiti. Lo script creerà anche un file chiamato `test_list.txt` che contiene l'elenco dei test da eseguire e che puoi eseguire localmente con il seguente comando: + +```bash +python -m pytest -n 8 --dist=loadfile -rA -s $(cat test_list.txt) +``` + +Nel caso in cui qualcosa sia sfuggito, l'intera suite di test viene eseguita quotidianamente. 
+ +## Build della documentazione + +Il job `ci/circleci: build_doc` esegue una build della documentazione per assicurarsi che tutto sia a posto una volta che la PR è stata unita. Se questo passaggio fallisce, puoi controllare localmente entrando nella cartella `docs` del repo Transformers e digitare + +```bash +make html +``` + +Sphinx non è noto per i suoi messaggi di errore chiari, quindi potrebbe essere necessario che provi alcune cose per trovare davvero la fonte dell'errore. + +## Stile del codice e della documentazione + +La formattazione del codice viene applicata a tutti i file sorgenti, agli esempi e ai test usando `black` e `isort`. Abbiamo anche uno strumento personalizzato che si occupa della formattazione delle docstring e dei file `rst` (`utils/style_doc.py`), così come dell'ordine dei lazy imports eseguiti nei file `__init__.py` dei Transformers (`utils/custom_init_isort.py`). Tutto questo può essere lanciato eseguendo + +```bash +make style +``` + +I controlli della CI sono applicati all'interno del controllo `ci/circleci: check_code_quality`. Esegue anche `flake8`, che dà un'occhiata di base al codice e si lamenta se trova una variabile non definita o non utilizzata. Per eseguire questo controllo localmente, usare + +```bash +make quality +``` + +Questa operazione può richiedere molto tempo, quindi per eseguire la stessa operazione solo sui file modificati nel branch corrente, eseguire + +```bash +make fixup +``` + +Quest'ultimo comando eseguirà anche tutti i controlli aggiuntivi per la consistenza del repository. Diamogli un'occhiata. + +## Coerenza del repository + +All'interno sono raggruppati tutti i test per assicurarsi che la tua PR lasci il repository in un buono stato ed è eseguito dal controllo `ci/circleci: check_repository_consistency`. Puoi eseguire localmente questo controllo eseguendo quanto segue: + +```bash +make repo-consistency +``` + +Questo verifica che: + +- Tutti gli oggetti aggiunti all'init sono documentati (eseguito da `utils/check_repo.py`) +- Tutti i file `__init__.py` hanno lo stesso contenuto nelle loro due sezioni (eseguito da `utils/check_inits.py`) +- Tutto il codice identificato come copia da un altro modulo è coerente con l'originale (eseguito da `utils/check_copies.py`) +- Le traduzioni dei README e l'indice della documentazione hanno lo stesso elenco di modelli del README principale (eseguito da `utils/check_copies.py`) +- Le tabelle autogenerate nella documentazione sono aggiornate (eseguito da `utils/check_table.py`) +- La libreria ha tutti gli oggetti disponibili anche se non tutte le dipendenze opzionali sono installate (eseguito da `utils/check_dummies.py`) + +Se questo controllo fallisce, le prime due voci richiedono una correzione manuale, mentre le ultime quattro possono essere corrette automaticamente per te eseguendo il comando + +```bash +make fix-copies +``` + +Ulteriori controlli riguardano le PR che aggiungono nuovi modelli, principalmente che: + +- Tutti i modelli aggiunti sono in un Auto-mapping (eseguita da `utils/check_repo.py`) + +- Tutti i modelli sono testati correttamente (eseguito da `utils/check_repo.py`) + + \ No newline at end of file diff --git a/docs/source/it/preprocessing.md b/docs/source/it/preprocessing.md new file mode 100644 index 000000000000..94578dfe166b --- /dev/null +++ b/docs/source/it/preprocessing.md @@ -0,0 +1,491 @@ + + +# Preprocess + +[[open-in-colab]] + +Prima di poter usare i dati in un modello, bisogna processarli in un formato accettabile per quest'ultimo. 
Un modello non comprende il testo grezzo, le immagini o l'audio. Bisogna convertire questi input in numeri e assemblarli all'interno di tensori. In questa esercitazione, tu potrai: + +* Preprocessare dati testuali con un tokenizer. +* Preprocessare immagini o dati audio con un estrattore di caratteristiche. +* Preprocessare dati per attività multimodali mediante un processore. + +## NLP + + + +Lo strumento principale per processare dati testuali è un [tokenizer](main_classes/tokenizer). Un tokenizer inizia separando il testo in *tokens* secondo una serie di regole. I tokens sono convertiti in numeri, questi vengono utilizzati per costruire i tensori di input del modello. Anche altri input addizionali se richiesti dal modello vengono aggiunti dal tokenizer. + + + +Se stai pensando si utilizzare un modello preaddestrato, è importante utilizzare il tokenizer preaddestrato associato. Questo assicura che il testo sia separato allo stesso modo che nel corpus usato per l'addestramento, e venga usata la stessa mappatura tokens-to-index (solitamente indicato come il *vocabolario*) come nel preaddestramento. + + + +Iniziamo subito caricando un tokenizer preaddestrato con la classe [`AutoTokenizer`]. Questo scarica il *vocabolario* usato quando il modello è stato preaddestrato. + +### Tokenize + +Carica un tokenizer preaddestrato con [`AutoTokenizer.from_pretrained`]: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") +``` + +Poi inserisci le tue frasi nel tokenizer: + +```py +>>> encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.") +>>> print(encoded_input) +{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +Il tokenizer restituisce un dizionario contenente tre oggetti importanti: + +* [input_ids](glossary#input-ids) sono gli indici che corrispondono ad ogni token nella frase. +* [attention_mask](glossary#attention-mask) indicata se un token deve essere elaborato o no. +* [token_type_ids](glossary#token-type-ids) identifica a quale sequenza appartiene un token se è presente più di una sequenza. + +Si possono decodificare gli `input_ids` per farsi restituire l'input originale: + +```py +>>> tokenizer.decode(encoded_input["input_ids"]) +'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]' +``` + +Come si può vedere, il tokenizer aggiunge due token speciali - `CLS` e `SEP` (classificatore e separatore) - alla frase. Non tutti i modelli hanno bisogno dei token speciali, ma se servono, il tokenizer li aggiungerà automaticamente. + +Se ci sono più frasi che vuoi processare, passale come una lista al tokenizer: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... 
] +>>> encoded_inputs = tokenizer(batch_sentences) +>>> print(encoded_inputs) +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1]]} +``` + +### Pad + +Questo è un argomento importante. Quando processi un insieme di frasi potrebbero non avere tutte la stessa lunghezza. Questo è un problema perchè i tensori, in input del modello, devono avere dimensioni uniformi. Il padding è una strategia per assicurarsi che i tensori siano rettangolari aggiungendo uno speciale *padding token* alle frasi più corte. + +Imposta il parametro `padding` a `True` per imbottire le frasi più corte nel gruppo in modo che combacino con la massima lunghezza presente: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_input = tokenizer(batch_sentences, padding=True) +>>> print(encoded_input) +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} +``` + +Nota che il tokenizer aggiunge alle sequenze degli `0` perchè sono troppo corte! + +### Truncation + +L'altra faccia della medaglia è che avolte le sequenze possono essere troppo lunghe per essere gestite dal modello. In questo caso, avrai bisogno di troncare la sequenza per avere una lunghezza minore. + +Imposta il parametro `truncation` a `True` per troncare una sequenza alla massima lunghezza accettata dal modello: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True) +>>> print(encoded_input) +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} +``` + +### Costruire i tensori + +Infine, vuoi che il tokenizer restituisca i tensori prodotti dal modello. + +Imposta il parametro `return_tensors` su `pt` per PyTorch, o `tf` per TensorFlow: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... 
] +>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt") +>>> print(encoded_input) +{'input_ids': tensor([[ 101, 153, 7719, 21490, 1122, 1114, 9582, 1623, 102], + [ 101, 5226, 1122, 9649, 1199, 2610, 1236, 102, 0]]), + 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0]]), + 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 0]])} +===PT-TF-SPLIT=== +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf") +>>> print(encoded_input) +{'input_ids': <tf.Tensor: shape=(2, 9), dtype=int32, numpy= +array([[ 101, 153, 7719, 21490, 1122, 1114, 9582, 1623, 102], + [ 101, 5226, 1122, 9649, 1199, 2610, 1236, 102, 0]], dtype=int32)>, + 'token_type_ids': <tf.Tensor: shape=(2, 9), dtype=int32, numpy= +array([[0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, + 'attention_mask': <tf.Tensor: shape=(2, 9), dtype=int32, numpy= +array([[1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 0]], dtype=int32)>} +``` + +## Audio + +Gli input audio sono processati in modo differente rispetto al testo, ma l'obiettivo rimane lo stesso: creare sequenze numeriche che il modello può capire. Un [estrattore di caratteristiche](main_classes/feature_extractor) è progettato con lo scopo preciso di estrarre caratteristiche da immagini o dati audio grezzi e convertirli in tensori. Prima di iniziare, installa 🤗 Datasets per caricare un dataset audio e sperimentare: + +```bash +pip install datasets +``` + +Carica il dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) (vedi il 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html) per avere maggiori dettagli su come caricare un dataset): + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") +``` + +Accedi al primo elemento della colonna `audio` per dare uno sguardo all'input. Richiamando la colonna `audio`, il file audio sarà caricato e ricampionato automaticamente: + +```py +>>> dataset[0]["audio"] +{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, + 0. , 0. ], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'sampling_rate': 8000} +``` + +Questo restituisce tre oggetti: + +* `array` è il segnale vocale caricato - e potenzialmente ricampionato - come vettore 1D. +* `path` è il percorso del file audio. +* `sampling_rate` si riferisce al numero di campioni del segnale vocale misurati al secondo. + +### Ricampionamento + +Per questo tutorial, puoi usare il modello [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base). Come puoi vedere dalla model card, il modello Wav2Vec2 è preaddestrato su un campionamento vocale a 16kHz. È importante che la frequenza di campionamento dei tuoi dati audio combaci con la frequenza di campionamento del dataset usato per preaddestrare il modello. Se la frequenza di campionamento dei tuoi dati non è uguale dovrai ricampionare i tuoi dati audio. + +Per esempio, il dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) ha una frequenza di campionamento di 8000Hz (8kHz). Per utilizzare il modello Wav2Vec2 su questo dataset, alzala a 16kHz: + +```py +>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") +>>> dataset[0]["audio"] +{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, + 0. , 0. ], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'sampling_rate': 8000} +``` + +1.
Usa il metodo di 🤗 Datasets' [`cast_column`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.cast_column) per alzare la frequenza di campionamento a 16kHz: + +```py +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000)) +``` + +2. Carica il file audio: + +```py +>>> dataset[0]["audio"] +{'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ..., + 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'sampling_rate': 16000} +``` + +Come puoi notare, la `sampling_rate` adesso è 16kHz! + +### Feature extractor + +Il prossimo passo è caricare un estrattore di caratteristiche per normalizzare e fare padding sull'input. Quando applichiamo il padding sui dati testuali, uno `0` è aggiunto alle sequenze più brevi. La stessa idea si applica ai dati audio, l'estrattore di caratteristiche per gli audio aggiungerà uno `0` - interpretato come silenzio - agli `array`. + +Carica l'estrattore delle caratteristiche con [`AutoFeatureExtractor.from_pretrained`]: + +```py +>>> from transformers import AutoFeatureExtractor + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") +``` + +Inserisci l' `array` audio nell'estrattore delle caratteristiche. Noi raccomandiamo sempre di aggiungere il parametro `sampling_rate` nell'estrattore delle caratteristiche per correggere meglio qualche errore, dovuto ai silenzi, che potrebbe verificarsi. + +```py +>>> audio_input = [dataset[0]["audio"]["array"]] +>>> feature_extractor(audio_input, sampling_rate=16000) +{'input_values': [array([ 3.8106556e-04, 2.7506407e-03, 2.8015103e-03, ..., + 5.6335266e-04, 4.6588284e-06, -1.7142107e-04], dtype=float32)]} +``` + +### Pad e truncate + +Come per il tokenizer, puoi applicare le operazioni padding o truncation per manipolare sequenze di variabili a lotti. Dai uno sguaro alla lunghezza delle sequenze di questi due campioni audio: + +```py +>>> dataset[0]["audio"]["array"].shape +(173398,) + +>>> dataset[1]["audio"]["array"].shape +(106496,) +``` + +Come puoi vedere, il primo campione ha una sequenza più lunga del secondo. Crea una funzione che preprocesserà il dataset. Specifica una lunghezza massima del campione, e l'estrattore di features si occuperà di riempire o troncare la sequenza per coincidervi: + +```py +>>> def preprocess_function(examples): +... audio_arrays = [x["array"] for x in examples["audio"]] +... inputs = feature_extractor( +... audio_arrays, +... sampling_rate=16000, +... padding=True, +... max_length=100000, +... truncation=True, +... ) +... return inputs +``` + +Applica la funzione ai primi esempi nel dataset: + +```py +>>> processed_dataset = preprocess_function(dataset[:5]) +``` + +Adesso guarda la lunghezza dei campioni elaborati: + +```py +>>> processed_dataset["input_values"][0].shape +(100000,) + +>>> processed_dataset["input_values"][1].shape +(100000,) +``` + +La lunghezza dei campioni adesso coincide con la massima lunghezza impostata nelle funzione. + +## Vision + +Un estrattore di caratteristiche si può usare anche per processare immagini e per compiti di visione. Ancora una volta, l'obiettivo è convertire l'immagine grezza in un lotto di tensori come input. + +Carica il dataset [food101](https://huggingface.co/datasets/food101) per questa esercitazione. 
Usa il parametro `split` di 🤗 Datasets per caricare solo un piccolo campione dal dataset di addestramento poichè il set di dati è molto grande: + +```py +>>> from datasets import load_dataset + +>>> dataset = load_dataset("food101", split="train[:100]") +``` + +Secondo passo, dai uno sguardo alle immagini usando la caratteristica [`Image`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image) di 🤗 Datasets: + +```py +>>> dataset[0]["image"] +``` + +![vision-preprocess-tutorial.png](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vision-preprocess-tutorial.png) + +### Feature extractor + +Carica l'estrattore di caratteristiche [`AutoFeatureExtractor.from_pretrained`]: + +```py +>>> from transformers import AutoFeatureExtractor + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224") +``` + +### Data augmentation + +Per le attività di visione, è usuale aggiungere alcuni tipi di data augmentation alle immagini come parte del preprocessing. Puoi aggiungere augmentations con qualsiasi libreria che preferisci, ma in questa esercitazione, userai il modulo [`transforms`](https://pytorch.org/vision/stable/transforms.html) di torchvision. + +1. Normalizza l'immagine e usa [`Compose`](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html) per concatenare alcune trasformazioni - [`RandomResizedCrop`](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html) e [`ColorJitter`](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html) - insieme: + +```py +>>> from torchvision.transforms import Compose, Normalize, RandomResizedCrop, ColorJitter, ToTensor + +>>> normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std) +>>> _transforms = Compose( +... [RandomResizedCrop(feature_extractor.size), ColorJitter(brightness=0.5, hue=0.5), ToTensor(), normalize] +... ) +``` + +2. Il modello accetta [`pixel_values`](model_doc/visionencoderdecoder#transformers.VisionEncoderDecoderModel.forward.pixel_values) come input. Questo valore è generato dall'estrattore di caratteristiche. Crea una funzione che genera `pixel_values` dai transforms: + +```py +>>> def transforms(examples): +... examples["pixel_values"] = [_transforms(image.convert("RGB")) for image in examples["image"]] +... return examples +``` + +3. Poi utilizza 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform)per applicare al volo la trasformazione: + +```py +>>> dataset.set_transform(transforms) +``` + +4. 
Adesso quando accedi all'immagine, puoi notare che l'estrattore di caratteristiche ha aggiunto `pixel_values` allo schema di input: + +```py +>>> dataset[0]["image"] +{'image': , + 'label': 6, + 'pixel_values': tensor([[[ 0.0353, 0.0745, 0.1216, ..., -0.9922, -0.9922, -0.9922], + [-0.0196, 0.0667, 0.1294, ..., -0.9765, -0.9843, -0.9922], + [ 0.0196, 0.0824, 0.1137, ..., -0.9765, -0.9686, -0.8667], + ..., + [ 0.0275, 0.0745, 0.0510, ..., -0.1137, -0.1216, -0.0824], + [ 0.0667, 0.0824, 0.0667, ..., -0.0588, -0.0745, -0.0980], + [ 0.0353, 0.0353, 0.0431, ..., -0.0039, -0.0039, -0.0588]], + + [[ 0.2078, 0.2471, 0.2863, ..., -0.9451, -0.9373, -0.9451], + [ 0.1608, 0.2471, 0.3098, ..., -0.9373, -0.9451, -0.9373], + [ 0.2078, 0.2706, 0.3020, ..., -0.9608, -0.9373, -0.8275], + ..., + [-0.0353, 0.0118, -0.0039, ..., -0.2392, -0.2471, -0.2078], + [ 0.0196, 0.0353, 0.0196, ..., -0.1843, -0.2000, -0.2235], + [-0.0118, -0.0039, -0.0039, ..., -0.0980, -0.0980, -0.1529]], + + [[ 0.3961, 0.4431, 0.4980, ..., -0.9216, -0.9137, -0.9216], + [ 0.3569, 0.4510, 0.5216, ..., -0.9059, -0.9137, -0.9137], + [ 0.4118, 0.4745, 0.5216, ..., -0.9137, -0.8902, -0.7804], + ..., + [-0.2314, -0.1922, -0.2078, ..., -0.4196, -0.4275, -0.3882], + [-0.1843, -0.1686, -0.2000, ..., -0.3647, -0.3804, -0.4039], + [-0.1922, -0.1922, -0.1922, ..., -0.2941, -0.2863, -0.3412]]])} +``` + +Di seguito come si vede l'immagine dopo la fase di preprocessing. Come ci si aspetterebbe dalle trasformazioni applicate, l'immagine è stata ritagliata in modo casuale e le proprietà del colore sono diverse. + +```py +>>> import numpy as np +>>> import matplotlib.pyplot as plt + +>>> img = dataset[0]["pixel_values"] +>>> plt.imshow(img.permute(1, 2, 0)) +``` + +![preprocessed_image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/preprocessed_image.png) + +## Multimodal + +Per attività multimodali userai una combinazione di tutto quello che hai imparato poco fa e applicherai le tue competenze alla comprensione automatica del parlato (Automatic Speech Recognition - ASR). Questo significa che avrai bisogno di: + +* Un estrattore delle caratteristiche per processare i dati audio. +* Il Tokenizer per processare i testi. 
+ +Ritorna sul datasere [LJ Speech](https://huggingface.co/datasets/lj_speech): + +```py +>>> from datasets import load_dataset + +>>> lj_speech = load_dataset("lj_speech", split="train") +``` + +Visto che sei interessato solo alle colonne `audio` e `text`, elimina tutte le altre: + +```py +>>> lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"]) +``` + +Adesso guarda le colonne `audio` e `text`: + +```py +>>> lj_speech[0]["audio"] +{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ..., + 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', + 'sampling_rate': 22050} + +>>> lj_speech[0]["text"] +'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition' +``` + +Ricorda dalla sezione precedente sull'elaborazione dei dati audio, tu dovresti sempre [ricampionare](preprocessing#audio) la frequenza di campionamento dei tuoi dati audio per farla coincidere con quella del dataset usato dal modello preaddestrato: + +```py +>>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000)) +``` + +### Processor + +Un processor combina un estrattore di caratteristiche e un tokenizer. Carica un processor con [`AutoProcessor.from_pretrained]: + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") +``` + +1. Crea una funzione che processi i dati audio in `input_values`, e tokenizza il testo in `labels`. Questi sono i tuoi input per il modello: + +```py +>>> def prepare_dataset(example): +... audio = example["audio"] + +... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000)) + +... return example +``` + +2. Applica la funzione `prepare_dataset` ad un campione: + +```py +>>> prepare_dataset(lj_speech[0]) +``` + +Nota che il processor ha aggiunto `input_values` e `labels`. La frequenza di campionamento è stata corretta riducendola a 16kHz. + +Fantastico, ora dovresti essere in grado di preelaborare i dati per qualsiasi modalità e persino di combinare modalità diverse! Nella prossima esercitazione, impareremo a mettere a punto un modello sui dati appena pre-elaborati. \ No newline at end of file diff --git a/docs/source/it/preprocessing.mdx b/docs/source/it/preprocessing.mdx deleted file mode 100644 index a57ff9df9151..000000000000 --- a/docs/source/it/preprocessing.mdx +++ /dev/null @@ -1,487 +0,0 @@ - - -# Preprocess - -[[open-in-colab]] - -Prima di poter usare i dati in un modello, bisogna processarli in un formato accettabile per quest'ultimo. Un modello non comprende il testo grezzo, le immagini o l'audio. Bisogna convertire questi input in numeri e assemblarli all'interno di tensori. In questa esercitazione, tu potrai: - -* Preprocessare dati testuali con un tokenizer. -* Preprocessare immagini o dati audio con un estrattore di caratteristiche. -* Preprocessare dati per attività multimodali mediante un processore. - -## NLP - - - -Lo strumento principale per processare dati testuali è un [tokenizer](main_classes/tokenizer). Un tokenizer inizia separando il testo in *tokens* secondo una serie di regole. I tokens sono convertiti in numeri, questi vengono utilizzati per costruire i tensori di input del modello. 
Anche altri input addizionali se richiesti dal modello vengono aggiunti dal tokenizer. - - - -Se stai pensando si utilizzare un modello preaddestrato, è importante utilizzare il tokenizer preaddestrato associato. Questo assicura che il testo sia separato allo stesso modo che nel corpus usato per l'addestramento, e venga usata la stessa mappatura tokens-to-index (solitamente indicato come il *vocabolario*) come nel preaddestramento. - - - -Iniziamo subito caricando un tokenizer preaddestrato con la classe [`AutoTokenizer`]. Questo scarica il *vocabolario* usato quando il modello è stato preaddestrato. - -### Tokenize - -Carica un tokenizer preaddestrato con [`AutoTokenizer.from_pretrained`]: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") -``` - -Poi inserisci le tue frasi nel tokenizer: - -```py ->>> encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.") ->>> print(encoded_input) -{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -Il tokenizer restituisce un dizionario contenente tre oggetti importanti: - -* [input_ids](glossary#input-ids) sono gli indici che corrispondono ad ogni token nella frase. -* [attention_mask](glossary#attention-mask) indicata se un token deve essere elaborato o no. -* [token_type_ids](glossary#token-type-ids) identifica a quale sequenza appartiene un token se è presente più di una sequenza. - -Si possono decodificare gli `input_ids` per farsi restituire l'input originale: - -```py ->>> tokenizer.decode(encoded_input["input_ids"]) -'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]' -``` - -Come si può vedere, il tokenizer aggiunge due token speciali - `CLS` e `SEP` (classificatore e separatore) - alla frase. Non tutti i modelli hanno bisogno dei token speciali, ma se servono, il tokenizer li aggiungerà automaticamente. - -Se ci sono più frasi che vuoi processare, passale come una lista al tokenizer: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_inputs = tokenizer(batch_sentences) ->>> print(encoded_inputs) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1]]} -``` - -### Pad - -Questo è un argomento importante. Quando processi un insieme di frasi potrebbero non avere tutte la stessa lunghezza. Questo è un problema perchè i tensori, in input del modello, devono avere dimensioni uniformi. Il padding è una strategia per assicurarsi che i tensori siano rettangolari aggiungendo uno speciale *padding token* alle frasi più corte. - -Imposta il parametro `padding` a `True` per imbottire le frasi più corte nel gruppo in modo che combacino con la massima lunghezza presente: - -```py ->>> batch_sentences = [ -... 
"But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch_sentences, padding=True) ->>> print(encoded_input) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} -``` - -Nota che il tokenizer aggiunge alle sequenze degli `0` perchè sono troppo corte! - -### Truncation - -L'altra faccia della medaglia è che avolte le sequenze possono essere troppo lunghe per essere gestite dal modello. In questo caso, avrai bisogno di troncare la sequenza per avere una lunghezza minore. - -Imposta il parametro `truncation` a `True` per troncare una sequenza alla massima lunghezza accettata dal modello: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True) ->>> print(encoded_input) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} -``` - -### Costruire i tensori - -Infine, vuoi che il tokenizer restituisca i tensori prodotti dal modello. - -Imposta il parametro `return_tensors` su `pt` per PyTorch, o `tf` per TensorFlow: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors="pt") ->>> print(encoded_input) -{'input_ids': tensor([[ 101, 153, 7719, 21490, 1122, 1114, 9582, 1623, 102], - [ 101, 5226, 1122, 9649, 1199, 2610, 1236, 102, 0]]), - 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0]]), - 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 0]])} -===PT-TF-SPLIT=== ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors="tf") ->>> print(encoded_input) -{'input_ids': , - 'token_type_ids': , - 'attention_mask': } -``` - -## Audio - -Gli input audio sono processati in modo differente rispetto al testo, ma l'obiettivo rimane lo stesso: creare sequenze numeriche che il modello può capire. 
Un [estrattore di caratteristiche](main_classes/feature_extractor) è progettato con lo scopo preciso di estrarre caratteristiche da immagini o dati audio grezzi e convertirli in tensori. Prima di iniziare, installa 🤗 Datasets per caricare un dataset audio e sperimentare: - -```bash -pip install datasets -``` - -Carica il dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) (vedi il 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html) per avere maggiori dettagli su come caricare un dataset): - -```py ->>> from datasets import load_dataset, Audio - ->>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") -``` - -Accedi al primo elemento della colonna `audio` per dare uno sguardo all'input. Richiamando la colonna `audio` sarà caricato automaticamente e ricampionato il file audio: - -```py ->>> dataset[0]["audio"] -{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, - 0. , 0. ], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 8000} -``` - -Questo restituisce tre oggetti: - -* `array` è il segnale vocale caricato - e potenzialmente ricampionato - come vettore 1D. -* `path` il percorso del file audio. -* `sampling_rate` si riferisce al numero di campioni del segnale vocale misurati al secondo. - -### Ricampionamento - -Per questo tutorial, puoi usare il modello [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base). Come puoi vedere dalla model card, il modello Wav2Vec2 è preaddestrato su un campionamento vocale a 16kHz.È importante che la frequenza di campionamento dei tuoi dati audio combaci con la frequenza di campionamento del dataset usato per preaddestrare il modello. Se la frequenza di campionamento dei tuoi dati non è uguale dovrai ricampionare i tuoi dati audio. - -Per esempio, il dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) ha una frequenza di campionamento di 8000kHz. Utilizzando il modello Wav2Vec2 su questo dataset, alzala a 16kHz: - -```py ->>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") ->>> dataset[0]["audio"] -{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, - 0. , 0. ], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 8000} -``` - -1. Usa il metodo di 🤗 Datasets' [`cast_column`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.cast_column) per alzare la frequenza di campionamento a 16kHz: - -```py ->>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000)) -``` - -2. Carica il file audio: - -```py ->>> dataset[0]["audio"] -{'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ..., - 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 16000} -``` - -Come puoi notare, la `sampling_rate` adesso è 16kHz! - -### Feature extractor - -Il prossimo passo è caricare un estrattore di caratteristiche per normalizzare e fare padding sull'input. Quando applichiamo il padding sui dati testuali, uno `0` è aggiunto alle sequenze più brevi. 
La stessa idea si applica ai dati audio, l'estrattore di caratteristiche per gli audio aggiungerà uno `0` - interpretato come silenzio - agli `array`. - -Carica l'estrattore delle caratteristiche con [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") -``` - -Inserisci l' `array` audio nell'estrattore delle caratteristiche. Noi raccomandiamo sempre di aggiungere il parametro `sampling_rate` nell'estrattore delle caratteristiche per correggere meglio qualche errore, dovuto ai silenzi, che potrebbe verificarsi. - -```py ->>> audio_input = [dataset[0]["audio"]["array"]] ->>> feature_extractor(audio_input, sampling_rate=16000) -{'input_values': [array([ 3.8106556e-04, 2.7506407e-03, 2.8015103e-03, ..., - 5.6335266e-04, 4.6588284e-06, -1.7142107e-04], dtype=float32)]} -``` - -### Pad e truncate - -Come per il tokenizer, puoi applicare le operazioni padding o truncation per manipolare sequenze di variabili a lotti. Dai uno sguaro alla lunghezza delle sequenze di questi due campioni audio: - -```py ->>> dataset[0]["audio"]["array"].shape -(173398,) - ->>> dataset[1]["audio"]["array"].shape -(106496,) -``` - -Come puoi vedere, il primo campione ha una sequenza più lunga del secondo. Crea una funzione che preprocesserà il dataset. Specifica una lunghezza massima del campione, e l'estrattore di features si occuperà di riempire o troncare la sequenza per coincidervi: - -```py ->>> def preprocess_function(examples): -... audio_arrays = [x["array"] for x in examples["audio"]] -... inputs = feature_extractor( -... audio_arrays, -... sampling_rate=16000, -... padding=True, -... max_length=100000, -... truncation=True, -... ) -... return inputs -``` - -Applica la funzione ai primi esempi nel dataset: - -```py ->>> processed_dataset = preprocess_function(dataset[:5]) -``` - -Adesso guarda la lunghezza dei campioni elaborati: - -```py ->>> processed_dataset["input_values"][0].shape -(100000,) - ->>> processed_dataset["input_values"][1].shape -(100000,) -``` - -La lunghezza dei campioni adesso coincide con la massima lunghezza impostata nelle funzione. - -## Vision - -Un estrattore di caratteristiche si può usare anche per processare immagini e per compiti di visione. Ancora una volta, l'obiettivo è convertire l'immagine grezza in un lotto di tensori come input. - -Carica il dataset [food101](https://huggingface.co/datasets/food101) per questa esercitazione. 
Usa il parametro `split` di 🤗 Datasets per caricare solo un piccolo campione dal dataset di addestramento poichè il set di dati è molto grande: - -```py ->>> from datasets import load_dataset - ->>> dataset = load_dataset("food101", split="train[:100]") -``` - -Secondo passo, dai uno sguardo alle immagini usando la caratteristica [`Image`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image) di 🤗 Datasets: - -```py ->>> dataset[0]["image"] -``` - -![vision-preprocess-tutorial.png](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vision-preprocess-tutorial.png) - -### Feature extractor - -Carica l'estrattore di caratteristiche [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224") -``` - -### Data augmentation - -Per le attività di visione, è usuale aggiungere alcuni tipi di data augmentation alle immagini come parte del preprocessing. Puoi aggiungere augmentations con qualsiasi libreria che preferisci, ma in questa esercitazione, userai il modulo [`transforms`](https://pytorch.org/vision/stable/transforms.html) di torchvision. - -1. Normalizza l'immagine e usa [`Compose`](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html) per concatenare alcune trasformazioni - [`RandomResizedCrop`](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html) e [`ColorJitter`](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html) - insieme: - -```py ->>> from torchvision.transforms import Compose, Normalize, RandomResizedCrop, ColorJitter, ToTensor - ->>> normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std) ->>> _transforms = Compose( -... [RandomResizedCrop(feature_extractor.size), ColorJitter(brightness=0.5, hue=0.5), ToTensor(), normalize] -... ) -``` - -2. Il modello accetta [`pixel_values`](model_doc/visionencoderdecoder#transformers.VisionEncoderDecoderModel.forward.pixel_values) come input. Questo valore è generato dall'estrattore di caratteristiche. Crea una funzione che genera `pixel_values` dai transforms: - -```py ->>> def transforms(examples): -... examples["pixel_values"] = [_transforms(image.convert("RGB")) for image in examples["image"]] -... return examples -``` - -3. Poi utilizza 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform)per applicare al volo la trasformazione: - -```py ->>> dataset.set_transform(transforms) -``` - -4. 
Adesso quando accedi all'immagine, puoi notare che l'estrattore di caratteristiche ha aggiunto `pixel_values` allo schema di input: - -```py ->>> dataset[0]["image"] -{'image': , - 'label': 6, - 'pixel_values': tensor([[[ 0.0353, 0.0745, 0.1216, ..., -0.9922, -0.9922, -0.9922], - [-0.0196, 0.0667, 0.1294, ..., -0.9765, -0.9843, -0.9922], - [ 0.0196, 0.0824, 0.1137, ..., -0.9765, -0.9686, -0.8667], - ..., - [ 0.0275, 0.0745, 0.0510, ..., -0.1137, -0.1216, -0.0824], - [ 0.0667, 0.0824, 0.0667, ..., -0.0588, -0.0745, -0.0980], - [ 0.0353, 0.0353, 0.0431, ..., -0.0039, -0.0039, -0.0588]], - - [[ 0.2078, 0.2471, 0.2863, ..., -0.9451, -0.9373, -0.9451], - [ 0.1608, 0.2471, 0.3098, ..., -0.9373, -0.9451, -0.9373], - [ 0.2078, 0.2706, 0.3020, ..., -0.9608, -0.9373, -0.8275], - ..., - [-0.0353, 0.0118, -0.0039, ..., -0.2392, -0.2471, -0.2078], - [ 0.0196, 0.0353, 0.0196, ..., -0.1843, -0.2000, -0.2235], - [-0.0118, -0.0039, -0.0039, ..., -0.0980, -0.0980, -0.1529]], - - [[ 0.3961, 0.4431, 0.4980, ..., -0.9216, -0.9137, -0.9216], - [ 0.3569, 0.4510, 0.5216, ..., -0.9059, -0.9137, -0.9137], - [ 0.4118, 0.4745, 0.5216, ..., -0.9137, -0.8902, -0.7804], - ..., - [-0.2314, -0.1922, -0.2078, ..., -0.4196, -0.4275, -0.3882], - [-0.1843, -0.1686, -0.2000, ..., -0.3647, -0.3804, -0.4039], - [-0.1922, -0.1922, -0.1922, ..., -0.2941, -0.2863, -0.3412]]])} -``` - -Di seguito come si vede l'immagine dopo la fase di preprocessing. Come ci si aspetterebbe dalle trasformazioni applicate, l'immagine è stata ritagliata in modo casuale e le proprietà del colore sono diverse. - -```py ->>> import numpy as np ->>> import matplotlib.pyplot as plt - ->>> img = dataset[0]["pixel_values"] ->>> plt.imshow(img.permute(1, 2, 0)) -``` - -![preprocessed_image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/preprocessed_image.png) - -## Multimodal - -Per attività multimodali userai una combinazione di tutto quello che hai imparato poco fa e applicherai le tue competenze alla comprensione automatica del parlato (Automatic Speech Recognition - ASR). Questo significa che avrai bisogno di: - -* Un estrattore delle caratteristiche per processare i dati audio. -* Il Tokenizer per processare i testi. 
- -Ritorna sul datasere [LJ Speech](https://huggingface.co/datasets/lj_speech): - -```py ->>> from datasets import load_dataset - ->>> lj_speech = load_dataset("lj_speech", split="train") -``` - -Visto che sei interessato solo alle colonne `audio` e `text`, elimina tutte le altre: - -```py ->>> lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"]) -``` - -Adesso guarda le colonne `audio` e `text`: - -```py ->>> lj_speech[0]["audio"] -{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ..., - 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', - 'sampling_rate': 22050} - ->>> lj_speech[0]["text"] -'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition' -``` - -Ricorda dalla sezione precedente sull'elaborazione dei dati audio, tu dovresti sempre [ricampionare](preprocessing#audio) la frequenza di campionamento dei tuoi dati audio per farla coincidere con quella del dataset usato dal modello preaddestrato: - -```py ->>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000)) -``` - -### Processor - -Un processor combina un estrattore di caratteristiche e un tokenizer. Carica un processor con [`AutoProcessor.from_pretrained]: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") -``` - -1. Crea una funzione che processi i dati audio in `input_values`, e tokenizza il testo in `labels`. Questi sono i tuoi input per il modello: - -```py ->>> def prepare_dataset(example): -... audio = example["audio"] - -... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000)) - -... return example -``` - -2. Applica la funzione `prepare_dataset` ad un campione: - -```py ->>> prepare_dataset(lj_speech[0]) -``` - -Nota che il processor ha aggiunto `input_values` e `labels`. La frequenza di campionamento è stata corretta riducendola a 16kHz. - -Fantastico, ora dovresti essere in grado di preelaborare i dati per qualsiasi modalità e persino di combinare modalità diverse! Nella prossima esercitazione, impareremo a mettere a punto un modello sui dati appena pre-elaborati. \ No newline at end of file diff --git a/docs/source/it/quicktour.md b/docs/source/it/quicktour.md new file mode 100644 index 000000000000..f0e981d18eb7 --- /dev/null +++ b/docs/source/it/quicktour.md @@ -0,0 +1,401 @@ + + +# Quick tour + +[[open-in-colab]] + +Entra in azione con 🤗 Transformers! Inizia utilizzando [`pipeline`] per un'inferenza veloce, carica un modello pre-allenato e un tokenizer con una [AutoClass](./model_doc/auto) per risolvere i tuoi compiti legati a testo, immagini o audio. + + + +Tutti gli esempi di codice presenti in questa documentazione hanno un pulsante in alto a sinistra che permette di selezionare tra PyTorch e TensorFlow. Se +questo non è presente, ci si aspetta che il codice funzioni per entrambi i backend senza alcun cambiamento. + + + +## Pipeline + +[`pipeline`] è il modo più semplice per utilizzare un modello pre-allenato per un dato compito. + + + +La [`pipeline`] supporta molti compiti comuni: + +**Testo**: +* Analisi del Sentimento (Sentiment Analysis, in inglese): classifica la polarità di un testo dato. 
+* Generazione del Testo (Text Generation, in inglese): genera del testo a partire da un dato input. +* Riconoscimento di Entità (Name Entity Recognition o NER, in inglese): etichetta ogni parola con l'entità che questa rappresenta (persona, data, luogo, ecc.). +* Rispondere a Domande (Question answering, in inglese): estrae la risposta da un contesto, dato del contesto e una domanda. +* Riempimento di Maschere (Fill-mask, in inglese): riempie gli spazi mancanti in un testo che ha parole mascherate. +* Riassumere (Summarization, in inglese): genera una sintesi di una lunga sequenza di testo o di un documento. +* Traduzione (Translation, in inglese): traduce un testo in un'altra lingua. +* Estrazione di Caratteristiche (Feature Extraction, in inglese): crea un tensore che rappresenta un testo. + +**Immagini**: +* Classificazione di Immagini (Image Classification, in inglese): classifica un'immagine. +* Segmentazione di Immagini (Image Segmentation, in inglese): classifica ogni pixel di un'immagine. +* Rilevazione di Oggetti (Object Detection, in inglese): rileva oggetti all'interno di un'immagine. + +**Audio**: +* Classificazione di Audio (Audio Classification, in inglese): assegna un'etichetta ad un segmento di audio dato. +* Riconoscimento Vocale Automatico (Automatic Speech Recognition o ASR, in inglese): trascrive il contenuto di un audio dato in un testo. + + + +Per maggiori dettagli legati alla [`pipeline`] e ai compiti ad essa associati, fai riferimento alla documentazione [qui](./main_classes/pipelines). + + + +### Utilizzo della Pipeline + +Nel seguente esempio, utilizzerai la [`pipeline`] per l'analisi del sentimento. + +Installa le seguenti dipendenze se non lo hai già fatto: + + + + +```bash +pip install torch +``` + + + +```bash +pip install tensorflow +``` + + + +Importa [`pipeline`] e specifica il compito che vuoi completare: + +```py +>>> from transformers import pipeline + +>>> classificatore = pipeline("sentiment-analysis", model="MilaNLProc/feel-it-italian-sentiment") +``` + +La pipeline scarica e salva il [modello pre-allenato](https://huggingface.co/MilaNLProc/feel-it-italian-sentiment) e il tokenizer per l'analisi del sentimento. Se non avessimo scelto un modello, la pipeline ne avrebbe scelto uno di default. Ora puoi utilizzare il `classifier` sul tuo testo obiettivo: + +```py +>>> classificatore("Siamo molto felici di mostrarti la libreria 🤗 Transformers.") +[{'label': 'positive', 'score': 0.9997}] +``` + +Per più di una frase, passa una lista di frasi alla [`pipeline`] la quale restituirà una lista di dizionari: + +```py +>>> risultati = classificatore( +... ["Siamo molto felici di mostrarti la libreria 🤗 Transformers.", "Speriamo te non la odierai."] +... ) +>>> for risultato in risultati: +... print(f"etichetta: {risultato['label']}, con punteggio: {round(risultato['score'], 4)}") +etichetta: positive, con punteggio: 0.9998 +etichetta: negative, con punteggio: 0.9998 +``` + +La [`pipeline`] può anche iterare su un dataset intero. Inizia installando la libreria [🤗 Datasets](https://huggingface.co/docs/datasets/): + +```bash +pip install datasets +``` + +Crea una [`pipeline`] con il compito che vuoi risolvere e con il modello che vuoi utilizzare. + +```py +>>> import torch +>>> from transformers import pipeline + +>>> riconoscitore_vocale = pipeline( +... "automatic-speech-recognition", model="radiogroup-crits/wav2vec2-xls-r-1b-italian-doc4lm-5gram" +... 
) +``` + +Poi, carica un dataset (vedi 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart.html) per maggiori dettagli) sul quale vuoi iterare. Per esempio, carichiamo il dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14): + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("PolyAI/minds14", name="it-IT", split="train") # doctest: +IGNORE_RESULT +``` + +Dobbiamo assicurarci che la frequenza di campionamento del set di dati corrisponda alla frequenza di campionamento con cui è stato addestrato `radiogroup-crits/wav2vec2-xls-r-1b-italian-doc4lm-5gram`. + +```py +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=riconoscitore_vocale.feature_extractor.sampling_rate)) +``` + +I file audio vengono caricati automaticamente e ri-campionati quando chiamiamo la colonna "audio". +Estraiamo i vettori delle forme d'onda grezze delle prime 4 osservazioni e passiamoli come lista alla pipeline: + +```py +>>> risultato = riconoscitore_vocale(dataset[:4]["audio"]) +>>> print([d["text"] for d in risultato]) +['dovrei caricare dei soldi sul mio conto corrente', 'buongiorno e senza vorrei depositare denaro sul mio conto corrente come devo fare per cortesia', 'sì salve vorrei depositare del denaro sul mio conto', 'e buon pomeriggio vorrei depositare dei soldi sul mio conto bancario volleo sapere come posso fare se e posso farlo online ed un altro conto o andandoo tramite bancomut'] +``` + +Per un dataset più grande dove gli input sono di dimensione maggiore (come nel parlato/audio o nella visione), dovrai passare un generatore al posto di una lista che carica tutti gli input in memoria. Guarda la [documentazione della pipeline](./main_classes/pipelines) per maggiori informazioni. + +### Utilizzare un altro modello e tokenizer nella pipeline + +La [`pipeline`] può ospitare qualsiasi modello del [Model Hub](https://huggingface.co/models), rendendo semplice l'adattamento della [`pipeline`] per altri casi d'uso. Per esempio, se si vuole un modello capace di trattare testo in francese, usa i tag presenti nel Model Hub in modo da filtrare per ottenere un modello appropriato. Il miglior risultato filtrato restituisce un modello multi-lingua [BERT model](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment) fine-tuned per l'analisi del sentimento. Ottimo, utilizziamo questo modello! 
+ +```py +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +``` + + + +Usa [`AutoModelForSequenceClassification`] e [`AutoTokenizer`] per caricare il modello pre-allenato e il suo tokenizer associato (maggiori informazioni su una `AutoClass` in seguito): + +```py +>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained(model_name) +>>> tokenizer = AutoTokenizer.from_pretrained(model_name) +``` + + +Usa [`TFAutoModelForSequenceClassification`] e [`AutoTokenizer`] per caricare il modello pre-allenato e il suo tokenizer associato (maggiori informazioni su una `TFAutoClass` in seguito): + +```py +>>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name) +>>> tokenizer = AutoTokenizer.from_pretrained(model_name) +``` + + + +Poi puoi specificare il modello e il tokenizer nella [`pipeline`], e applicare il `classifier` sul tuo testo obiettivo: + +```py +>>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer) +>>> classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.") +[{'label': '5 stars', 'score': 0.7273}] +``` + +Se non riesci a trovare un modello per il tuo caso d'uso, dovrai fare fine-tuning di un modello pre-allenato sui tuoi dati. Dai un'occhiata al nostro tutorial [fine-tuning tutorial](./training) per imparare come. Infine, dopo che hai completato il fine-tuning del tuo modello pre-allenato, considera per favore di condividerlo (vedi il tutorial [qui](./model_sharing)) con la comunità sul Model Hub per democratizzare l'NLP! 🤗 + +## AutoClass + + + +Al suo interno, le classi [`AutoModelForSequenceClassification`] e [`AutoTokenizer`] lavorano assieme per dare potere alla [`pipeline`]. Una [AutoClass](./model_doc/auto) è una scorciatoia che automaticamente recupera l'architettura di un modello pre-allenato a partire dal suo nome o path. Hai solo bisogno di selezionare la `AutoClass` appropriata per il tuo compito e il suo tokenizer associato con [`AutoTokenizer`]. + +Ritorniamo al nostro esempio e vediamo come puoi utilizzare la `AutoClass` per replicare i risultati della [`pipeline`]. + +### AutoTokenizer + +Un tokenizer è responsabile dell'elaborazione del testo in modo da trasformarlo in un formato comprensibile dal modello. Per prima cosa, il tokenizer dividerà il testo in parole chiamate *token*. Ci sono diverse regole che governano il processo di tokenizzazione, tra cui come dividere una parola e a quale livello (impara di più sulla tokenizzazione [qui](./tokenizer_summary)). La cosa più importante da ricordare comunque è che hai bisogno di inizializzare il tokenizer con lo stesso nome del modello in modo da assicurarti che stai utilizzando le stesse regole di tokenizzazione con cui il modello è stato pre-allenato. + +Carica un tokenizer con [`AutoTokenizer`]: + +```py +>>> from transformers import AutoTokenizer + +>>> nome_del_modello = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> tokenizer = AutoTokenizer.from_pretrained(nome_del_modello) +``` + +Dopodiché, il tokenizer converte i token in numeri in modo da costruire un tensore come input del modello. Questo è conosciuto come il *vocabolario* del modello. 
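Per vedere questa mappatura in pratica, ecco uno schizzo puramente illustrativo (ipotizzando il `tokenizer` appena caricato; `tokenize` e `convert_tokens_to_ids` mostrano solo i passaggi intermedi, e l'output esatto dipende dal vocabolario del checkpoint scelto):

```py
>>> # Solo a scopo illustrativo: suddividi il testo in token e osserva l'id di ciascun token nel vocabolario
>>> tokens = tokenizer.tokenize("Siamo molto felici di mostrarti la libreria 🤗 Transformers.")
>>> ids = tokenizer.convert_tokens_to_ids(tokens)
>>> print(list(zip(tokens, ids)))
```

Nella pratica non serve chiamare questi metodi a mano: basta passare il testo direttamente al tokenizer, come mostrato di seguito.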
+ +Passa il tuo testo al tokenizer: + +```py +>>> encoding = tokenizer("Siamo molto felici di mostrarti la libreria 🤗 Transformers.") +>>> print(encoding) +{'input_ids': [101, 56821, 10132, 14407, 13019, 13007, 10120, 47201, 10330, 10106, 91686, 100, 58263, 119, 102], +'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], +'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +Il tokenizer restituirà un dizionario contenente: + +* [input_ids](./glossary#input-ids): rappresentazioni numeriche dei tuoi token. +* [attention_mask](.glossary#attention-mask): indica quali token devono essere presi in considerazione. + +Come con la [`pipeline`], il tokenizer accetterà una lista di input. In più, il tokenizer può anche completare (pad, in inglese) e troncare il testo in modo da restituire un lotto (batch, in inglese) di lunghezza uniforme: + + + +```py +>>> pt_batch = tokenizer( +... ["Siamo molto felici di mostrarti la libreria 🤗 Transformers.", "Speriamo te non la odierai."], +... padding=True, +... truncation=True, +... max_length=512, +... return_tensors="pt", +... ) +``` + + +```py +>>> tf_batch = tokenizer( +... ["Siamo molto felici di mostrarti la libreria 🤗 Transformers.", "Speriamo te non la odierai."], +... padding=True, +... truncation=True, +... max_length=512, +... return_tensors="tf", +... ) +``` + + + +Leggi il tutorial sul [preprocessing](./preprocessing) per maggiori dettagli sulla tokenizzazione. + +### AutoModel + + + +🤗 Transformers fornisce un metodo semplice e unificato per caricare istanze pre-allenate. Questo significa che puoi caricare un [`AutoModel`] come caricheresti un [`AutoTokenizer`]. L'unica differenza è selezionare l'[`AutoModel`] corretto per il compito di interesse. Dato che stai facendo classificazione di testi, o sequenze, carica [`AutoModelForSequenceClassification`]: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name) +``` + + + +Guarda il [task summary](./task_summary) per sapere quale classe di [`AutoModel`] utilizzare per quale compito. + + + +Ora puoi passare il tuo lotto di input pre-processati direttamente al modello. Devi solo spacchettare il dizionario aggiungendo `**`: + +```py +>>> pt_outputs = pt_model(**pt_batch) +``` + +Il modello produrrà le attivazioni finali nell'attributo `logits`. Applica la funzione softmax a `logits` per ottenere le probabilità: + +```py +>>> from torch import nn + +>>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1) +>>> print(pt_predictions) +tensor([[0.0041, 0.0037, 0.0203, 0.2005, 0.7713], + [0.3766, 0.3292, 0.1832, 0.0558, 0.0552]], grad_fn=) +``` + + +🤗 Transformers fornisce un metodo semplice e unificato per caricare istanze pre-allenate. Questo significa che puoi caricare un [`TFAutoModel`] come caricheresti un [`AutoTokenizer`]. L'unica differenza è selezionare il [`TFAutoModel`] corretto per il compito di interesse. Dato che stai facendo classificazione di testi, o sequenze, carica [`TFAutoModelForSequenceClassification`]: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> nome_del_modello = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(nome_del_modello) +``` + + + +Guarda il [task summary](./task_summary) per sapere quale classe di [`AutoModel`] utilizzare per quale compito. 
+ + + +Ora puoi passare il tuo lotto di input pre-processati direttamente al modello passando le chiavi del dizionario al tensore: + +```py +>>> tf_outputs = tf_model(tf_batch) +``` + +Il modello produrrà le attivazioni finali nell'attributo `logits`. Applica la funzione softmax a `logits` per ottenere le probabilità: +```py +>>> import tensorflow as tf + +>>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1) +>>> tf_predictions # doctest: +IGNORE_RESULT +``` + + + + + +Tutti i modelli di 🤗 Transformers (PyTorch e TensorFlow) restituiscono i tensori *prima* della funzione finale +di attivazione (come la softmax) perché la funzione di attivazione finale viene spesso unita a quella di perdita. + + + +I modelli sono [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) o [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) standard così puoi utilizzarli all'interno del tuo training loop usuale. Tuttavia, per rendere le cose più semplici, 🤗 Transformers fornisce una classe [`Trainer`] per PyTorch che aggiunge delle funzionalità per l'allenamento distribuito, precisione mista, e altro ancora. Per TensorFlow, puoi utilizzare il metodo `fit` di [Keras](https://keras.io/). Fai riferimento al [tutorial per il training](./training) per maggiori dettagli. + + + +Gli output del modello di 🤗 Transformers sono delle dataclasses speciali in modo che i loro attributi vengano auto-completati all'interno di un IDE. +Gli output del modello si comportano anche come una tupla o un dizionario (ad esempio, puoi indicizzare con un intero, una slice o una stringa) nel qual caso gli attributi che sono `None` vengono ignorati. + + + +### Salva un modello + + + +Una volta completato il fine-tuning del tuo modello, puoi salvarlo con il suo tokenizer utilizzando [`PreTrainedModel.save_pretrained`]: + +```py +>>> pt_save_directory = "./pt_save_pretrained" +>>> tokenizer.save_pretrained(pt_save_directory) # doctest: +IGNORE_RESULT +>>> pt_model.save_pretrained(pt_save_directory) +``` + +Quando desideri utilizzare il tuo modello nuovamente, puoi ri-caricarlo con [`PreTrainedModel.from_pretrained`]: + +```py +>>> pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained") +``` + + +Una volta completato il fine-tuning del tuo modello, puoi salvarlo con il suo tokenizer utilizzando [`TFPreTrainedModel.save_pretrained`]: + +```py +>>> tf_save_directory = "./tf_save_pretrained" +>>> tokenizer.save_pretrained(tf_save_directory) # doctest: +IGNORE_RESULT +>>> tf_model.save_pretrained(tf_save_directory) +``` + +Quando desideri utilizzare il tuo modello nuovamente, puoi ri-caricarlo con [`TFPreTrainedModel.from_pretrained`]: + +```py +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained") +``` + + + +Una caratteristica particolarmente interessante di 🤗 Transformers è la sua abilità di salvare un modello e ri-caricarlo sia come modello di PyTorch che di TensorFlow. 
I parametri `from_pt` o `from_tf` possono convertire un modello da un framework all'altro: + + + + +```py +>>> from transformers import AutoModel + +>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) +``` + + + +```py +>>> from transformers import TFAutoModel + +>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) +``` + + diff --git a/docs/source/it/quicktour.mdx b/docs/source/it/quicktour.mdx deleted file mode 100644 index 2378edd2c2a1..000000000000 --- a/docs/source/it/quicktour.mdx +++ /dev/null @@ -1,393 +0,0 @@ - - -# Quick tour - -[[open-in-colab]] - -Entra in azione con 🤗 Transformers! Inizia utilizzando [`pipeline`] per un'inferenza veloce, carica un modello pre-allenato e un tokenizer con una [AutoClass](./model_doc/auto) per risolvere i tuoi compiti legati a testo, immagini o audio. - - - -Tutti gli esempi di codice presenti in questa documentazione hanno un pulsante in alto a sinistra che permette di selezionare tra PyTorch e TensorFlow. Se -questo non è presente, ci si aspetta che il codice funzioni per entrambi i backend senza alcun cambiamento. - - - -## Pipeline - -[`pipeline`] è il modo più semplice per utilizzare un modello pre-allenato per un dato compito. - - - -La [`pipeline`] supporta molti compiti comuni: - -**Testo**: -* Analisi del Sentimento (Sentiment Analysis, in inglese): classifica la polarità di un testo dato. -* Generazione del Testo (Text Generation, in inglese): genera del testo a partire da un dato input. -* Riconoscimento di Entità (Name Entity Recognition o NER, in inglese): etichetta ogni parola con l'entità che questa rappresenta (persona, data, luogo, ecc.). -* Rispondere a Domande (Question answering, in inglese): estrae la risposta da un contesto, dato del contesto e una domanda. -* Riempimento di Maschere (Fill-mask, in inglese): riempie gli spazi mancanti in un testo che ha parole mascherate. -* Riassumere (Summarization, in inglese): genera una sintesi di una lunga sequenza di testo o di un documento. -* Traduzione (Translation, in inglese): traduce un testo in un'altra lingua. -* Estrazione di Caratteristiche (Feature Extraction, in inglese): crea un tensore che rappresenta un testo. - -**Immagini**: -* Classificazione di Immagini (Image Classification, in inglese): classifica un'immagine. -* Segmentazione di Immagini (Image Segmentation, in inglese): classifica ogni pixel di un'immagine. -* Rilevazione di Oggetti (Object Detection, in inglese): rileva oggetti all'interno di un'immagine. - -**Audio**: -* Classificazione di Audio (Audio Classification, in inglese): assegna un'etichetta ad un segmento di audio dato. -* Riconoscimento Vocale Automatico (Automatic Speech Recognition o ASR, in inglese): trascrive il contenuto di un audio dato in un testo. - - - -Per maggiori dettagli legati alla [`pipeline`] e ai compiti ad essa associati, fai riferimento alla documentazione [qui](./main_classes/pipelines). - - - -### Utilizzo della Pipeline - -Nel seguente esempio, utilizzerai la [`pipeline`] per l'analisi del sentimento. 
- -Installa le seguenti dipendenze se non lo hai già fatto: - - - -```bash -pip install torch -``` - - -```bash -pip install tensorflow -``` - - - -Importa [`pipeline`] e specifica il compito che vuoi completare: - -```py ->>> from transformers import pipeline - ->>> classificatore = pipeline("sentiment-analysis", model="MilaNLProc/feel-it-italian-sentiment") -``` - -La pipeline scarica e salva il [modello pre-allenato](https://huggingface.co/MilaNLProc/feel-it-italian-sentiment) e il tokenizer per l'analisi del sentimento. Se non avessimo scelto un modello, la pipeline ne avrebbe scelto uno di default. Ora puoi utilizzare il `classifier` sul tuo testo obiettivo: - -```py ->>> classificatore("Siamo molto felici di mostrarti la libreria 🤗 Transformers.") -[{'label': 'positive', 'score': 0.9997}] -``` - -Per più di una frase, passa una lista di frasi alla [`pipeline`] la quale restituirà una lista di dizionari: - -```py ->>> risultati = classificatore( -... ["Siamo molto felici di mostrarti la libreria 🤗 Transformers.", "Speriamo te non la odierai."] -... ) ->>> for risultato in risultati: -... print(f"etichetta: {risultato['label']}, con punteggio: {round(risultato['score'], 4)}") -etichetta: positive, con punteggio: 0.9998 -etichetta: negative, con punteggio: 0.9998 -``` - -La [`pipeline`] può anche iterare su un dataset intero. Inizia installando la libreria [🤗 Datasets](https://huggingface.co/docs/datasets/): - -```bash -pip install datasets -``` - -Crea una [`pipeline`] con il compito che vuoi risolvere e con il modello che vuoi utilizzare. - -```py ->>> import torch ->>> from transformers import pipeline - ->>> riconoscitore_vocale = pipeline( -... "automatic-speech-recognition", model="radiogroup-crits/wav2vec2-xls-r-1b-italian-doc4lm-5gram" -... ) -``` - -Poi, carica un dataset (vedi 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart.html) per maggiori dettagli) sul quale vuoi iterare. Per esempio, carichiamo il dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14): - -```py ->>> from datasets import load_dataset, Audio - ->>> dataset = load_dataset("PolyAI/minds14", name="it-IT", split="train") # doctest: +IGNORE_RESULT -``` - -Dobbiamo assicurarci che la frequenza di campionamento del set di dati corrisponda alla frequenza di campionamento con cui è stato addestrato `radiogroup-crits/wav2vec2-xls-r-1b-italian-doc4lm-5gram`. - -```py ->>> dataset = dataset.cast_column("audio", Audio(sampling_rate=riconoscitore_vocale.feature_extractor.sampling_rate)) -``` - -I file audio vengono caricati automaticamente e ri-campionati quando chiamiamo la colonna "audio". -Estraiamo i vettori delle forme d'onda grezze delle prime 4 osservazioni e passiamoli come lista alla pipeline: - -```py ->>> risultato = riconoscitore_vocale(dataset[:4]["audio"]) ->>> print([d["text"] for d in risultato]) -['dovrei caricare dei soldi sul mio conto corrente', 'buongiorno e senza vorrei depositare denaro sul mio conto corrente come devo fare per cortesia', 'sì salve vorrei depositare del denaro sul mio conto', 'e buon pomeriggio vorrei depositare dei soldi sul mio conto bancario volleo sapere come posso fare se e posso farlo online ed un altro conto o andandoo tramite bancomut'] -``` - -Per un dataset più grande dove gli input sono di dimensione maggiore (come nel parlato/audio o nella visione), dovrai passare un generatore al posto di una lista che carica tutti gli input in memoria. Guarda la [documentazione della pipeline](./main_classes/pipelines) per maggiori informazioni. 
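A puro titolo illustrativo, uno schizzo minimale di questo approccio (ipotizzando il `riconoscitore_vocale` e il `dataset` MInDS-14 creati sopra; `genera_audio` è solo un nome di comodo): un generatore Python produce un campione alla volta, così la pipeline non deve tenere tutti gli input in memoria.

```py
>>> def genera_audio(dataset):
...     # produce un campione audio alla volta, invece di materializzare una lista completa
...     for esempio in dataset:
...         yield esempio["audio"]

>>> for output in riconoscitore_vocale(genera_audio(dataset)):
...     print(output["text"])
```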
- -### Utilizzare un altro modello e tokenizer nella pipeline - -La [`pipeline`] può ospitare qualsiasi modello del [Model Hub](https://huggingface.co/models), rendendo semplice l'adattamento della [`pipeline`] per altri casi d'uso. Per esempio, se si vuole un modello capace di trattare testo in francese, usa i tag presenti nel Model Hub in modo da filtrare per ottenere un modello appropriato. Il miglior risultato filtrato restituisce un modello multi-lingua [BERT model](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment) fine-tuned per l'analisi del sentimento. Ottimo, utilizziamo questo modello! - -```py ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" -``` - - - -Usa [`AutoModelForSequenceClassification`] e [`AutoTokenizer`] per caricare il modello pre-allenato e il suo tokenizer associato (maggiori informazioni su una `AutoClass` in seguito): - -```py ->>> from transformers import AutoTokenizer, AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained(model_name) ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) -``` - - -Usa [`TFAutoModelForSequenceClassification`] e [`AutoTokenizer`] per caricare il modello pre-allenato e il suo tokenizer associato (maggiori informazioni su una `TFAutoClass` in seguito): - -```py ->>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name) ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) -``` - - - -Poi puoi specificare il modello e il tokenizer nella [`pipeline`], e applicare il `classifier` sul tuo testo obiettivo: - -```py ->>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer) ->>> classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.") -[{'label': '5 stars', 'score': 0.7273}] -``` - -Se non riesci a trovare un modello per il tuo caso d'uso, dovrai fare fine-tuning di un modello pre-allenato sui tuoi dati. Dai un'occhiata al nostro tutorial [fine-tuning tutorial](./training) per imparare come. Infine, dopo che hai completato il fine-tuning del tuo modello pre-allenato, considera per favore di condividerlo (vedi il tutorial [qui](./model_sharing)) con la comunità sul Model Hub per democratizzare l'NLP! 🤗 - -## AutoClass - - - -Al suo interno, le classi [`AutoModelForSequenceClassification`] e [`AutoTokenizer`] lavorano assieme per dare potere alla [`pipeline`]. Una [AutoClass](./model_doc/auto) è una scorciatoia che automaticamente recupera l'architettura di un modello pre-allenato a partire dal suo nome o path. Hai solo bisogno di selezionare la `AutoClass` appropriata per il tuo compito e il suo tokenizer associato con [`AutoTokenizer`]. - -Ritorniamo al nostro esempio e vediamo come puoi utilizzare la `AutoClass` per replicare i risultati della [`pipeline`]. - -### AutoTokenizer - -Un tokenizer è responsabile dell'elaborazione del testo in modo da trasformarlo in un formato comprensibile dal modello. Per prima cosa, il tokenizer dividerà il testo in parole chiamate *token*. Ci sono diverse regole che governano il processo di tokenizzazione, tra cui come dividere una parola e a quale livello (impara di più sulla tokenizzazione [qui](./tokenizer_summary)). 
La cosa più importante da ricordare comunque è che hai bisogno di inizializzare il tokenizer con lo stesso nome del modello in modo da assicurarti che stai utilizzando le stesse regole di tokenizzazione con cui il modello è stato pre-allenato. - -Carica un tokenizer con [`AutoTokenizer`]: - -```py ->>> from transformers import AutoTokenizer - ->>> nome_del_modello = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> tokenizer = AutoTokenizer.from_pretrained(nome_del_modello) -``` - -Dopodiché, il tokenizer converte i token in numeri in modo da costruire un tensore come input del modello. Questo è conosciuto come il *vocabolario* del modello. - -Passa il tuo testo al tokenizer: - -```py ->>> encoding = tokenizer("Siamo molto felici di mostrarti la libreria 🤗 Transformers.") ->>> print(encoding) -{'input_ids': [101, 56821, 10132, 14407, 13019, 13007, 10120, 47201, 10330, 10106, 91686, 100, 58263, 119, 102], -'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], -'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -Il tokenizer restituirà un dizionario contenente: - -* [input_ids](./glossary#input-ids): rappresentazioni numeriche dei tuoi token. -* [attention_mask](.glossary#attention-mask): indica quali token devono essere presi in considerazione. - -Come con la [`pipeline`], il tokenizer accetterà una lista di input. In più, il tokenizer può anche completare (pad, in inglese) e troncare il testo in modo da restituire un lotto (batch, in inglese) di lunghezza uniforme: - - - -```py ->>> pt_batch = tokenizer( -... ["Siamo molto felici di mostrarti la libreria 🤗 Transformers.", "Speriamo te non la odierai."], -... padding=True, -... truncation=True, -... max_length=512, -... return_tensors="pt", -... ) -``` - - -```py ->>> tf_batch = tokenizer( -... ["Siamo molto felici di mostrarti la libreria 🤗 Transformers.", "Speriamo te non la odierai."], -... padding=True, -... truncation=True, -... max_length=512, -... return_tensors="tf", -... ) -``` - - - -Leggi il tutorial sul [preprocessing](./preprocessing) per maggiori dettagli sulla tokenizzazione. - -### AutoModel - - - -🤗 Transformers fornisce un metodo semplice e unificato per caricare istanze pre-allenate. Questo significa che puoi caricare un [`AutoModel`] come caricheresti un [`AutoTokenizer`]. L'unica differenza è selezionare l'[`AutoModel`] corretto per il compito di interesse. Dato che stai facendo classificazione di testi, o sequenze, carica [`AutoModelForSequenceClassification`]: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name) -``` - - - -Guarda il [task summary](./task_summary) per sapere quale classe di [`AutoModel`] utilizzare per quale compito. - - - -Ora puoi passare il tuo lotto di input pre-processati direttamente al modello. Devi solo spacchettare il dizionario aggiungendo `**`: - -```py ->>> pt_outputs = pt_model(**pt_batch) -``` - -Il modello produrrà le attivazioni finali nell'attributo `logits`. Applica la funzione softmax a `logits` per ottenere le probabilità: - -```py ->>> from torch import nn - ->>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1) ->>> print(pt_predictions) -tensor([[0.0041, 0.0037, 0.0203, 0.2005, 0.7713], - [0.3766, 0.3292, 0.1832, 0.0558, 0.0552]], grad_fn=) -``` - - -🤗 Transformers fornisce un metodo semplice e unificato per caricare istanze pre-allenate. 
Questo significa che puoi caricare un [`TFAutoModel`] come caricheresti un [`AutoTokenizer`]. L'unica differenza è selezionare il [`TFAutoModel`] corretto per il compito di interesse. Dato che stai facendo classificazione di testi, o sequenze, carica [`TFAutoModelForSequenceClassification`]: - -```py ->>> from transformers import TFAutoModelForSequenceClassification - ->>> nome_del_modello = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(nome_del_modello) -``` - - - -Guarda il [task summary](./task_summary) per sapere quale classe di [`AutoModel`] utilizzare per quale compito. - - - -Ora puoi passare il tuo lotto di input pre-processati direttamente al modello passando le chiavi del dizionario al tensore: - -```py ->>> tf_outputs = tf_model(tf_batch) -``` - -Il modello produrrà le attivazioni finali nell'attributo `logits`. Applica la funzione softmax a `logits` per ottenere le probabilità: -```py ->>> import tensorflow as tf - ->>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1) ->>> tf_predictions # doctest: +IGNORE_RESULT -``` - - - - - -Tutti i modelli di 🤗 Transformers (PyTorch e TensorFlow) restituiscono i tensori *prima* della funzione finale -di attivazione (come la softmax) perché la funzione di attivazione finale viene spesso unita a quella di perdita. - - - -I modelli sono [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) o [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) standard così puoi utilizzarli all'interno del tuo training loop usuale. Tuttavia, per rendere le cose più semplici, 🤗 Transformers fornisce una classe [`Trainer`] per PyTorch che aggiunge delle funzionalità per l'allenamento distribuito, precisione mista, e altro ancora. Per TensorFlow, puoi utilizzare il metodo `fit` di [Keras](https://keras.io/). Fai riferimento al [tutorial per il training](./training) per maggiori dettagli. - - - -Gli output del modello di 🤗 Transformers sono delle dataclasses speciali in modo che i loro attributi vengano auto-completati all'interno di un IDE. -Gli output del modello si comportano anche come una tupla o un dizionario (ad esempio, puoi indicizzare con un intero, una slice o una stringa) nel qual caso gli attributi che sono `None` vengono ignorati. 
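Ad esempio, uno schizzo puramente indicativo con i `pt_outputs` calcolati sopra (i tre accessi restituiscono lo stesso tensore di logits quando `loss` è `None`):

```py
>>> pt_outputs.logits     # accesso come attributo
>>> pt_outputs["logits"]  # accesso come in un dizionario
>>> pt_outputs[0]         # accesso come in una tupla: gli attributi a None vengono saltati
```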
- - - -### Salva un modello - - - -Una volta completato il fine-tuning del tuo modello, puoi salvarlo con il suo tokenizer utilizzando [`PreTrainedModel.save_pretrained`]: - -```py ->>> pt_save_directory = "./pt_save_pretrained" ->>> tokenizer.save_pretrained(pt_save_directory) # doctest: +IGNORE_RESULT ->>> pt_model.save_pretrained(pt_save_directory) -``` - -Quando desideri utilizzare il tuo modello nuovamente, puoi ri-caricarlo con [`PreTrainedModel.from_pretrained`]: - -```py ->>> pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained") -``` - - -Una volta completato il fine-tuning del tuo modello, puoi salvarlo con il suo tokenizer utilizzando [`TFPreTrainedModel.save_pretrained`]: - -```py ->>> tf_save_directory = "./tf_save_pretrained" ->>> tokenizer.save_pretrained(tf_save_directory) # doctest: +IGNORE_RESULT ->>> tf_model.save_pretrained(tf_save_directory) -``` - -Quando desideri utilizzare il tuo modello nuovamente, puoi ri-caricarlo con [`TFPreTrainedModel.from_pretrained`]: - -```py ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained") -``` - - - -Una caratteristica particolarmente interessante di 🤗 Transformers è la sua abilità di salvare un modello e ri-caricarlo sia come modello di PyTorch che di TensorFlow. I parametri `from_pt` o `from_tf` possono convertire un modello da un framework all'altro: - - - -```py ->>> from transformers import AutoModel - ->>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) -``` - - -```py ->>> from transformers import TFAutoModel - ->>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) -``` - - diff --git a/docs/source/it/run_scripts.md b/docs/source/it/run_scripts.md new file mode 100644 index 000000000000..327eb9374d38 --- /dev/null +++ b/docs/source/it/run_scripts.md @@ -0,0 +1,351 @@ + + +# Addestramento con script + +Insieme ai [notebooks](./noteboks/README) 🤗 Transformers, ci sono anche esempi di script che dimostrano come addestrare un modello per un task con [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), o [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). + +Troverai anche script che abbiamo usato nei nostri [progetti di ricerca](https://github.com/huggingface/transformers/tree/main/examples/research_projects) e [precedenti esempi](https://github.com/huggingface/transformers/tree/main/examples/legacy) a cui contribuisce per lo più la comunità. Questi script non sono attivamente mantenuti e richiedono una specifica versione di 🤗 Transformers che sarà molto probabilmente incompatibile con l'ultima versione della libreria. + +Non è dato per scontato che gli script di esempio funzionino senza apportare modifiche per ogni problema, bensì potrebbe essere necessario adattare lo script al tuo caso specifico. Per aiutarti in ciò, la maggioranza degli script espone le modalità di pre-processamento dei dati, consentendoti di modificare lo script come preferisci. + +Per qualsiasi feature che vorresti implementare in uno script d'esempio, per favore discutine nel [forum](https://discuss.huggingface.co/) o in un'[issue](https://github.com/huggingface/transformers/issues) prima di inviare una Pull Request. 
Mentre accogliamo con piacere la correzione di bug, è più improbabile che faremo la stessa con una PR che aggiunge funzionalità sacrificando la leggibilità. + +Questa guida ti mostrerà come eseguire uno script di esempio relativo al task di summarization in [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) e [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization). Tutti gli esempi funzioneranno con entrambi i framework a meno che non sia specificato altrimenti. + +## Installazione + +Per eseguire con successo l'ultima versione degli script di esempio, devi **installare 🤗 Transformers dalla fonte** in un nuovo ambiente virtuale: + +```bash +git clone https://github.com/huggingface/transformers +cd transformers +pip install . +``` +Per le precedenti versioni degli script di esempio, clicca sul pulsante di seguito: + +
+ Esempi per versioni precedenti di 🤗 Transformers + +
+ +Successivamente, cambia la tua attuale copia di 🤗 Transformers specificandone la versione, ad esempio v3.5.1: + +```bash +git checkout tags/v3.5.1 +``` + + Dopo aver configurato correttamente la versione della libreria, naviga nella cartella degli esempi di tua scelta e installa i requisiti: + +```bash +pip install -r requirements.txt +``` + +## Esegui uno script + + + + +Lo script di esempio scarica e pre-processa un dataset dalla libreria 🤗 [Datasets](https://huggingface.co/docs/datasets/). Successivamente, lo script esegue il fine-tuning su un dataset usando il [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) su un'architettura che supporta la summarization. Il seguente esempio mostra come eseguire il fine-tuning di [T5-small](https://huggingface.co/t5-small) sul dataset [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). Il modello T5 richiede un parametro addizionale `source_prefix` a causa del modo in cui è stato addestrato. Questo prefisso permette a T5 di sapere che si tratta di un task di summarization. + +```bash +python examples/pytorch/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + + +Lo script di esempio scarica e pre-processa un dataset dalla libreria 🤗 [Datasets](https://huggingface.co/docs/datasets/). Successivamente, lo script esegue il fine-tuning su un dataset usando Keras su un'architettura che supporta la summarization. Il seguente esempio mostra come eseguire il fine-tuning di [T5-small](https://huggingface.co/t5-small) sul dataset [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). Il modello T5 richiede un parametro addizionale `source_prefix` a causa del modo in cui è stato addestrato. Questo prefisso permette a T5 di sapere che si tratta di un task di summarization. + +```bash +python examples/tensorflow/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 16 \ + --num_train_epochs 3 \ + --do_train \ + --do_eval +``` + + + +## Addestramento distribuito e precisione mista + +Il [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) supporta l'addestramento distribuito e la precisione mista, che significa che puoi anche usarla in uno script. Per abilitare entrambe le funzionalità: + +- Aggiunto l'argomento `fp16` per abilitare la precisione mista. +- Imposta un numero di GPU da usare con l'argomento `nproc_per_node`. + +```bash +python -m torch.distributed.launch \ + --nproc_per_node 8 pytorch/summarization/run_summarization.py \ + --fp16 \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + +Gli script TensorFlow utilizzano una [`MirroredStrategy`](https://www.tensorflow.org/guide/distributed_training#mirroredstrategy) per il training distribuito e non devi aggiungere alcun argomento addizionale allo script di training. 
+
+## Esegui uno script su TPU
+
+
+
+Le Tensor Processing Units (TPU) sono state progettate per migliorare le prestazioni. PyTorch supporta le TPU con il compilatore per deep learning [XLA](https://www.tensorflow.org/xla) (guarda [questo link](https://github.com/pytorch/xla/blob/master/README.md) per maggiori dettagli). Per usare una TPU, avvia lo script `xla_spawn.py` e usa l'argomento `num_cores` per impostare il numero di core TPU che intendi usare.
+
+```bash
+python xla_spawn.py --num_cores 8 \
+    summarization/run_summarization.py \
+    --model_name_or_path t5-small \
+    --do_train \
+    --do_eval \
+    --dataset_name cnn_dailymail \
+    --dataset_config "3.0.0" \
+    --source_prefix "summarize: " \
+    --output_dir /tmp/tst-summarization \
+    --per_device_train_batch_size=4 \
+    --per_device_eval_batch_size=4 \
+    --overwrite_output_dir \
+    --predict_with_generate
+```
+
+Le Tensor Processing Units (TPU) sono state progettate per migliorare le prestazioni. Gli script TensorFlow utilizzano una [`TPUStrategy`](https://www.tensorflow.org/guide/distributed_training#tpustrategy) per eseguire l'addestramento su TPU. Per usare una TPU, passa il nome della risorsa TPU all'argomento `tpu`.
+
+```bash
+python run_summarization.py \
+    --tpu name_of_tpu_resource \
+    --model_name_or_path t5-small \
+    --dataset_name cnn_dailymail \
+    --dataset_config "3.0.0" \
+    --output_dir /tmp/tst-summarization \
+    --per_device_train_batch_size 8 \
+    --per_device_eval_batch_size 16 \
+    --num_train_epochs 3 \
+    --do_train \
+    --do_eval
+```
+
+
+
+## Esegui uno script con 🤗 Accelerate
+
+🤗 [Accelerate](https://huggingface.co/docs/accelerate) è una libreria compatibile solo con PyTorch che offre un metodo unificato per addestrare modelli su diverse tipologie di configurazioni (CPU, più GPU, TPU) mantenendo una completa visibilità sul ciclo di training di PyTorch. Assicurati di aver effettuato l'installazione di 🤗 Accelerate, nel caso non lo avessi già fatto:
+
+> Nota: dato che Accelerate è in rapido sviluppo, è necessario installare la versione proveniente da git per eseguire gli script:
+```bash
+pip install git+https://github.com/huggingface/accelerate
+```
+
+Invece di usare lo script `run_summarization.py`, devi usare lo script `run_summarization_no_trainer.py`. Gli script supportati da 🤗 Accelerate hanno un file chiamato `task_no_trainer.py` nella rispettiva cartella. Per iniziare, esegui il seguente comando per creare e salvare un file di configurazione:
+
+```bash
+accelerate config
+```
+
+Testa la tua configurazione per assicurarti della sua correttezza:
+
+```bash
+accelerate test
+```
+
+Ora sei pronto per avviare l'addestramento:
+
+```bash
+accelerate launch run_summarization_no_trainer.py \
+    --model_name_or_path t5-small \
+    --dataset_name cnn_dailymail \
+    --dataset_config "3.0.0" \
+    --source_prefix "summarize: " \
+    --output_dir ~/tmp/tst-summarization
+```
+
+## Uso di un dataset personalizzato
+
+Lo script di summarization supporta dataset personalizzati purché siano file CSV o JSON Lines. Quando usi il tuo dataset, devi specificare diversi argomenti aggiuntivi:
+
+- `train_file` e `validation_file` specificano dove si trovano i file di addestramento e di validazione.
+- `text_column` è la colonna che contiene il testo da riassumere.
+- `summary_column` è la colonna che contiene il testo di riepilogo atteso in output.
+
+Uno script di summarization che usa un dataset personalizzato sarebbe simile a questo:
+
+```bash
+python examples/pytorch/summarization/run_summarization.py \
+    --model_name_or_path t5-small \
+    --do_train \
+    --do_eval \
+    --train_file path_to_csv_or_jsonlines_file \
+    --validation_file path_to_csv_or_jsonlines_file \
+    --text_column text_column_name \
+    --summary_column summary_column_name \
+    --source_prefix "summarize: " \
+    --output_dir /tmp/tst-summarization \
+    --overwrite_output_dir \
+    --per_device_train_batch_size=4 \
+    --per_device_eval_batch_size=4 \
+    --predict_with_generate
+```
+
+## Testare uno script
+
+È spesso una buona idea avviare il tuo script su un numero ridotto di esempi del dataset, per assicurarti che tutto funzioni come previsto prima di eseguirlo sull'intero dataset, cosa che potrebbe richiedere ore. Usa i seguenti argomenti per limitare il dataset a un numero massimo di esempi:
+
+- `max_train_samples`
+- `max_eval_samples`
+- `max_predict_samples`
+
+```bash
+python examples/pytorch/summarization/run_summarization.py \
+    --model_name_or_path t5-small \
+    --max_train_samples 50 \
+    --max_eval_samples 50 \
+    --max_predict_samples 50 \
+    --do_train \
+    --do_eval \
+    --dataset_name cnn_dailymail \
+    --dataset_config "3.0.0" \
+    --source_prefix "summarize: " \
+    --output_dir /tmp/tst-summarization \
+    --per_device_train_batch_size=4 \
+    --per_device_eval_batch_size=4 \
+    --overwrite_output_dir \
+    --predict_with_generate
+```
+
+Non tutti gli script di esempio supportano l'argomento `max_predict_samples`. Se non sei sicuro che il tuo script lo supporti, aggiungi l'argomento `-h` per controllare:
+
+```bash
+examples/pytorch/summarization/run_summarization.py -h
+```
+
+## Riavviare l'addestramento da un checkpoint
+
+Un'altra opzione utile è riavviare l'addestramento da un checkpoint precedente. In questo modo, se l'addestramento viene interrotto, potrai riprendere da dove ti eri fermato senza ricominciare da capo. Ci sono due metodi per riavviare l'addestramento da un checkpoint:
+
+Il primo metodo usa l'argomento `output_dir previous_output_dir` per riavviare l'addestramento dall'ultimo checkpoint contenuto in `output_dir`. In questo caso, dovresti rimuovere `overwrite_output_dir`:
+
+```bash
+python examples/pytorch/summarization/run_summarization.py \
+    --model_name_or_path t5-small \
+    --do_train \
+    --do_eval \
+    --dataset_name cnn_dailymail \
+    --dataset_config "3.0.0" \
+    --source_prefix "summarize: " \
+    --output_dir /tmp/tst-summarization \
+    --per_device_train_batch_size=4 \
+    --per_device_eval_batch_size=4 \
+    --output_dir previous_output_dir \
+    --predict_with_generate
+```
+
+Il secondo metodo usa l'argomento `resume_from_checkpoint path_to_specific_checkpoint` per riavviare l'addestramento da una specifica cartella di checkpoint.
+
+```bash
+python examples/pytorch/summarization/run_summarization.py \
+    --model_name_or_path t5-small \
+    --do_train \
+    --do_eval \
+    --dataset_name cnn_dailymail \
+    --dataset_config "3.0.0" \
+    --source_prefix "summarize: " \
+    --output_dir /tmp/tst-summarization \
+    --per_device_train_batch_size=4 \
+    --per_device_eval_batch_size=4 \
+    --overwrite_output_dir \
+    --resume_from_checkpoint path_to_specific_checkpoint \
+    --predict_with_generate
+```
+
+## Condividi il tuo modello
+
+Tutti gli script possono caricare il tuo modello finale sul [Model Hub](https://huggingface.co/models).
Prima di iniziare, assicurati di aver effettuato l'accesso su Hugging Face: + +```bash +huggingface-cli login +``` + +Poi, aggiungi l'argomento `push_to_hub` allo script. Questo argomento consentirà di creare un repository con il tuo username Hugging Face e la cartella specificata in `output_dir`. + +Per dare uno specifico nome al repository, usa l'argomento `push_to_hub_model_id`. Il repository verrà automaticamente elencata sotto al tuo namespace. + +Il seguente esempio mostra come caricare un modello specificando il nome del repository: + +```bash +python examples/pytorch/summarization/run_summarization.py + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --push_to_hub \ + --push_to_hub_model_id finetuned-t5-cnn_dailymail \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` diff --git a/docs/source/it/run_scripts.mdx b/docs/source/it/run_scripts.mdx deleted file mode 100644 index 3ffd58a62830..000000000000 --- a/docs/source/it/run_scripts.mdx +++ /dev/null @@ -1,347 +0,0 @@ - - -# Addestramento con script - -Insieme ai [notebooks](./noteboks/README) 🤗 Transformers, ci sono anche esempi di script che dimostrano come addestrare un modello per un task con [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), o [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). - -Troverai anche script che abbiamo usato nei nostri [progetti di ricerca](https://github.com/huggingface/transformers/tree/main/examples/research_projects) e [precedenti esempi](https://github.com/huggingface/transformers/tree/main/examples/legacy) a cui contribuisce per lo più la comunità. Questi script non sono attivamente mantenuti e richiedono una specifica versione di 🤗 Transformers che sarà molto probabilmente incompatibile con l'ultima versione della libreria. - -Non è dato per scontato che gli script di esempio funzionino senza apportare modifiche per ogni problema, bensì potrebbe essere necessario adattare lo script al tuo caso specifico. Per aiutarti in ciò, la maggioranza degli script espone le modalità di pre-processamento dei dati, consentendoti di modificare lo script come preferisci. - -Per qualsiasi feature che vorresti implementare in uno script d'esempio, per favore discutine nel [forum](https://discuss.huggingface.co/) o in un'[issue](https://github.com/huggingface/transformers/issues) prima di inviare una Pull Request. Mentre accogliamo con piacere la correzione di bug, è più improbabile che faremo la stessa con una PR che aggiunge funzionalità sacrificando la leggibilità. - -Questa guida ti mostrerà come eseguire uno script di esempio relativo al task di summarization in [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) e [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization). Tutti gli esempi funzioneranno con entrambi i framework a meno che non sia specificato altrimenti. - -## Installazione - -Per eseguire con successo l'ultima versione degli script di esempio, devi **installare 🤗 Transformers dalla fonte** in un nuovo ambiente virtuale: - -```bash -git clone https://github.com/huggingface/transformers -cd transformers -pip install . 
-``` -Per le precedenti versioni degli script di esempio, clicca sul pulsante di seguito: - -
- Esempi per versioni precedenti di 🤗 Transformers - -
- -Successivamente, cambia la tua attuale copia di 🤗 Transformers specificandone la versione, ad esempio v3.5.1: - -```bash -git checkout tags/v3.5.1 -``` - - Dopo aver configurato correttamente la versione della libreria, naviga nella cartella degli esempi di tua scelta e installa i requisiti: - -```bash -pip install -r requirements.txt -``` - -## Esegui uno script - - - - -Lo script di esempio scarica e pre-processa un dataset dalla libreria 🤗 [Datasets](https://huggingface.co/docs/datasets/). Successivamente, lo script esegue il fine-tuning su un dataset usando il [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) su un'architettura che supporta la summarization. Il seguente esempio mostra come eseguire il fine-tuning di [T5-small](https://huggingface.co/t5-small) sul dataset [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). Il modello T5 richiede un parametro addizionale `source_prefix` a causa del modo in cui è stato addestrato. Questo prefisso permette a T5 di sapere che si tratta di un task di summarization. - -```bash -python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - - -Lo script di esempio scarica e pre-processa un dataset dalla libreria 🤗 [Datasets](https://huggingface.co/docs/datasets/). Successivamente, lo script esegue il fine-tuning su un dataset usando Keras su un'architettura che supporta la summarization. Il seguente esempio mostra come eseguire il fine-tuning di [T5-small](https://huggingface.co/t5-small) sul dataset [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). Il modello T5 richiede un parametro addizionale `source_prefix` a causa del modo in cui è stato addestrato. Questo prefisso permette a T5 di sapere che si tratta di un task di summarization. - -```bash -python examples/tensorflow/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 16 \ - --num_train_epochs 3 \ - --do_train \ - --do_eval -``` - - - -## Addestramento distribuito e precisione mista - -Il [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) supporta l'addestramento distribuito e la precisione mista, che significa che puoi anche usarla in uno script. Per abilitare entrambe le funzionalità: - -- Aggiunto l'argomento `fp16` per abilitare la precisione mista. -- Imposta un numero di GPU da usare con l'argomento `nproc_per_node`. - -```bash -python -m torch.distributed.launch \ - --nproc_per_node 8 pytorch/summarization/run_summarization.py \ - --fp16 \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - -Gli script TensorFlow utilizzano una [`MirroredStrategy`](https://www.tensorflow.org/guide/distributed_training#mirroredstrategy) per il training distribuito e non devi aggiungere alcun argomento addizionale allo script di training. 
Lo script TensorFlow userà multiple GPU in modo predefinito se quest'ultime sono disponibili: - -## Esegui uno script su TPU - - - -Le Tensor Processing Units (TPU) sono state progettate per migliorare le prestazioni. PyTorch supporta le TPU con il compilatore per deep learning [XLA](https://www.tensorflow.org/xla) (guarda [questo link](https://github.com/pytorch/xla/blob/master/README.md) per maggiori dettagli). Per usare una TPU, avvia lo script `xla_spawn.py` e usa l'argomento `num_cores` per impostare il numero di core TPU che intendi usare. - -```bash -python xla_spawn.py --num_cores 8 \ - summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - - -Le Tensor Processing Units (TPU) sono state progettate per migliorare le prestazioni. Gli script TensorFlow utilizzano una [`TPUStrategy`](https://www.tensorflow.org/guide/distributed_training#tpustrategy) per eseguire l'addestramento su TPU. Per usare una TPU, passa il nome della risorsa TPU all'argomento `tpu`. - -```bash -python run_summarization.py \ - --tpu name_of_tpu_resource \ - --model_name_or_path t5-small \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 16 \ - --num_train_epochs 3 \ - --do_train \ - --do_eval -``` - - - -## Esegui uno script con 🤗 Accelerate - -🤗 [Accelerate](https://huggingface.co/docs/accelerate) è una libreria compatibile solo con PyTorch che offre un metodo unificato per addestrare modelli su diverse tipologie di configurazioni (CPU, multiple GPU, TPU) mantenendo una completa visibilità rispetto al ciclo di training di PyTorch. Assicurati di aver effettuato l'installazione di 🤗 Accelerate, nel caso non lo avessi fatto: - -> Nota: dato che Accelerate è in rapido sviluppo, è necessario installare la versione proveniente da git per eseguire gli script: -```bash -pip install git+https://github.com/huggingface/accelerate -``` - -Invece che usare lo script `run_summarization.py`, devi usare lo script `run_summarization_no_trainer.py`. Gli script supportati in 🤗 Accelerate avranno un file chiamato `task_no_trainer.py` nella rispettiva cartella. Per iniziare, esegui il seguente comando per creare e salvare un file di configurazione: - -```bash -accelerate config -``` - -Testa la tua configurazione per assicurarti della sua correttezza: - -```bash -accelerate test -``` - -Ora sei pronto per avviare l'addestramento: - -```bash -accelerate launch run_summarization_no_trainer.py \ - --model_name_or_path t5-small \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir ~/tmp/tst-summarization -``` - -## Uso di un dataset personalizzato - -Lo script di summarization supporta dataset personalizzati purché siano file CSV o JSON Line. Quando usi il tuo dataset, devi specificare diversi argomenti aggiuntivi: - -- `train_file` e `validation_file` specificano dove si trovano i file di addestramento e validazione. -- `text_column` è il file di input da riassumere. -- `summary_column` è il file di destinazione per l'output. 
- -Uno script di summarization usando un dataset personalizzato sarebbe simile a questo: - -```bash -python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --train_file path_to_csv_or_jsonlines_file \ - --validation_file path_to_csv_or_jsonlines_file \ - --text_column text_column_name \ - --summary_column summary_column_name \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --overwrite_output_dir \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --predict_with_generate -``` - -## Testare uno script - -È spesso una buona idea avviare il tuo script su un numero inferiore di esempi tratti dal dataset, per assicurarti che tutto funzioni come previsto prima di eseguire lo script sull'intero dataset, che potrebbe necessitare di ore. Usa i seguenti argomenti per limitare il dataset ad un massimo numero di esempi: - -- `max_train_samples` -- `max_eval_samples` -- `max_predict_samples` - -```bash -python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --max_train_samples 50 \ - --max_eval_samples 50 \ - --max_predict_samples 50 \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - -Non tutti gli esempi di script supportano l'argomento `max_predict_samples`. Se non sei sicuro circa il supporto di questo argomento da parte del tuo script, aggiungi l'argomento `-h` per controllare: - -```bash -examples/pytorch/summarization/run_summarization.py -h -``` - -## Riavviare addestramento da un checkpoint - -Un'altra utile opzione è riavviare un addestramento da un checkpoint precedente. Questo garantirà che tu possa riprendere da dove hai interrotto senza ricominciare se l'addestramento viene interrotto. Ci sono due metodi per riavviare l'addestramento da un checkpoint: - -Il primo metodo usa l'argomento `output_dir previous_output_dir` per riavviare l'addestramento dall'ultima versione del checkpoint contenuto in `output_dir`. In questo caso, dovresti rimuovere `overwrite_output_dir`: - -```bash -python examples/pytorch/summarization/run_summarization.py - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --output_dir previous_output_dir \ - --predict_with_generate -``` - -Il secondo metodo usa l'argomento `resume_from_checkpoint path_to_specific_checkpoint` per riavviare un addestramento da una specifica cartella di checkpoint. - -```bash -python examples/pytorch/summarization/run_summarization.py - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --resume_from_checkpoint path_to_specific_checkpoint \ - --predict_with_generate -``` - -## Condividi il tuo modello - -Tutti gli script possono caricare il tuo modello finale al [Model Hub](https://huggingface.co/models). 
Prima di iniziare, assicurati di aver effettuato l'accesso su Hugging Face: - -```bash -huggingface-cli login -``` - -Poi, aggiungi l'argomento `push_to_hub` allo script. Questo argomento consentirà di creare un repository con il tuo username Hugging Face e la cartella specificata in `output_dir`. - -Per dare uno specifico nome al repository, usa l'argomento `push_to_hub_model_id`. Il repository verrà automaticamente elencata sotto al tuo namespace. - -Il seguente esempio mostra come caricare un modello specificando il nome del repository: - -```bash -python examples/pytorch/summarization/run_summarization.py - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --push_to_hub \ - --push_to_hub_model_id finetuned-t5-cnn_dailymail \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` diff --git a/docs/source/it/serialization.md b/docs/source/it/serialization.md new file mode 100644 index 000000000000..0067f1a3c52e --- /dev/null +++ b/docs/source/it/serialization.md @@ -0,0 +1,677 @@ + + +# Esporta modelli 🤗 Transformers + +Se devi implementare 🤗 modelli Transformers in ambienti di produzione, noi +consigliamo di esportarli in un formato serializzato che può essere caricato ed eseguito +su runtime e hardware specializzati. In questa guida ti mostreremo come farlo +esporta 🤗 Modelli Transformers in due formati ampiamente utilizzati: ONNX e TorchScript. + +Una volta esportato, un modello può essere ottimizato per l'inferenza tramite tecniche come +la quantizzazione e soppressione. Se sei interessato a ottimizzare i tuoi modelli per l'esecuzione +con la massima efficienza, dai un'occhiata a [🤗 Optimum +library](https://github.com/huggingface/optimum). + +## ONNX + +Il progetto [ONNX (Open Neural Network eXchange)](http://onnx.ai) Il progetto onnx è un open +standard che definisce un insieme comune di operatori e un formato di file comune a +rappresentano modelli di deep learning in un'ampia varietà di framework, tra cui +PyTorch e TensorFlow. Quando un modello viene esportato nel formato ONNX, questi +operatori sono usati per costruire un grafico computazionale (often called an +_intermediate representation_) che rappresenta il flusso di dati attraverso la +rete neurale. + +Esponendo un grafico con operatori e tipi di dati standardizzati, ONNX rende +più facile passare da un framework all'altro. Ad esempio, un modello allenato in PyTorch può +essere esportato in formato ONNX e quindi importato in TensorFlow (e viceversa). + +🤗 Transformers fornisce un pacchetto `transformers.onnx` che ti consente di +convertire i checkpoint del modello in un grafico ONNX sfruttando gli oggetti di configurazione. +Questi oggetti di configurazione sono già pronti per una serie di architetture di modelli, +e sono progettati per essere facilmente estensibili ad altre architetture. 
+ +Le configurazioni pronte includono le seguenti architetture: + + + +- ALBERT +- BART +- BEiT +- BERT +- BigBird +- BigBird-Pegasus +- Blenderbot +- BlenderbotSmall +- CamemBERT +- ConvBERT +- Data2VecText +- Data2VecVision +- DeiT +- DistilBERT +- ELECTRA +- FlauBERT +- GPT Neo +- GPT-J +- I-BERT +- LayoutLM +- M2M100 +- Marian +- mBART +- MobileBERT +- OpenAI GPT-2 +- Perceiver +- PLBart +- RoBERTa +- RoFormer +- SqueezeBERT +- T5 +- ViT +- XLM +- XLM-RoBERTa +- XLM-RoBERTa-XL + +Nelle prossime due sezioni, ti mostreremo come: + +* Esporta un modello supportato usando il pacchetto `transformers.onnx`. +* Esporta un modello personalizzato per un'architettura non supportata. + +### Esportazione di un modello in ONNX + +Per esportare un modello 🤗 Transformers in ONNX, dovrai prima installarne alcune +dipendenze extra: + +```bash +pip install transformers[onnx] +``` + +Il pacchetto `transformers.onnx` può essere usato come modulo Python: + +```bash +python -m transformers.onnx --help + +usage: Hugging Face Transformers ONNX exporter [-h] -m MODEL [--feature {causal-lm, ...}] [--opset OPSET] [--atol ATOL] output + +positional arguments: + output Path indicating where to store generated ONNX model. + +optional arguments: + -h, --help show this help message and exit + -m MODEL, --model MODEL + Model ID on huggingface.co or path on disk to load model from. + --feature {causal-lm, ...} + The type of features to export the model with. + --opset OPSET ONNX opset version to export the model with. + --atol ATOL Absolute difference tolerance when validating the model. +``` + +L'esportazione di un checkpoint utilizzando una configurazione già pronta può essere eseguita come segue: + +```bash +python -m transformers.onnx --model=distilbert-base-uncased onnx/ +``` + +che dovrebbe mostrare i seguenti log: + +```bash +Validating ONNX model... + -[✓] ONNX model output names match reference model ({'last_hidden_state'}) + - Validating ONNX Model output "last_hidden_state": + -[✓] (2, 8, 768) matches (2, 8, 768) + -[✓] all values close (atol: 1e-05) +All good, model saved at: onnx/model.onnx +``` + +Questo esporta un grafico ONNX del checkpoint definito dall'argomento `--model`. +In questo esempio è `distilbert-base-uncased`, ma può essere qualsiasi checkpoint +Hugging Face Hub o uno memorizzato localmente. + +Il file risultante `model.onnx` può quindi essere eseguito su uno dei [tanti +acceleratori](https://onnx.ai/supported-tools.html#deployModel) che supportano il +lo standard ONNX. Ad esempio, possiamo caricare ed eseguire il modello con [ONNX +Runtime](https://onnxruntime.ai/) come segue: + +```python +>>> from transformers import AutoTokenizer +>>> from onnxruntime import InferenceSession + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") +>>> session = InferenceSession("onnx/model.onnx") +>>> # ONNX Runtime expects NumPy arrays as input +>>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np") +>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs)) +``` + +I nomi di output richiesti (cioè `["last_hidden_state"]`) possono essere ottenuti +dando un'occhiata alla configurazione ONNX di ogni modello. 
Ad esempio, per +DistilBERT abbiamo: + +```python +>>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig + +>>> config = DistilBertConfig() +>>> onnx_config = DistilBertOnnxConfig(config) +>>> print(list(onnx_config.outputs.keys())) +["last_hidden_state"] +``` + +Il processo è identico per i checkpoint TensorFlow sull'hub. Ad esempio, noi +possiamo esportare un checkpoint TensorFlow puro da [Keras +organizzazione](https://huggingface.co/keras-io) come segue: + +```bash +python -m transformers.onnx --model=keras-io/transformers-qa onnx/ +``` + +Per esportare un modello memorizzato localmente, devi disporre dei pesi del modello +e file tokenizer memorizzati in una directory. Ad esempio, possiamo caricare e salvare un +checkpoint come segue: + + + +```python +>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification + +>>> # Load tokenizer and PyTorch weights form the Hub +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") +>>> pt_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +>>> # Save to disk +>>> tokenizer.save_pretrained("local-pt-checkpoint") +>>> pt_model.save_pretrained("local-pt-checkpoint") +``` + +Una volta salvato il checkpoint, possiamo esportarlo su ONNX puntando l'argomento `--model` +del pacchetto `transformers.onnx` nella directory desiderata: + +```bash +python -m transformers.onnx --model=local-pt-checkpoint onnx/ +``` + + +```python +>>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification + +>>> # Load tokenizer and TensorFlow weights from the Hub +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +>>> # Save to disk +>>> tokenizer.save_pretrained("local-tf-checkpoint") +>>> tf_model.save_pretrained("local-tf-checkpoint") +``` + +Once the checkpoint is saved, we can export it to ONNX by pointing the `--model` +argument of the `transformers.onnx` package to the desired directory: + +```bash +python -m transformers.onnx --model=local-tf-checkpoint onnx/ +``` + + + +### Selezione delle caratteristiche per diverse topologie di modello + +Ogni configurazione già pronta viene fornita con una serie di _caratteristiche_ che ti consentono di +esportare modelli per diversi tipi di topologie o attività. Come mostrato nella tabella +di seguito, ogni caratteristica è associata a una diversa Auto Class: + +| Caratteristica | Auto Class | +| ------------------------------------ | ------------------------------------ | +| `causal-lm`, `causal-lm-with-past` | `AutoModelForCausalLM` | +| `default`, `default-with-past` | `AutoModel` | +| `masked-lm` | `AutoModelForMaskedLM` | +| `question-answering` | `AutoModelForQuestionAnswering` | +| `seq2seq-lm`, `seq2seq-lm-with-past` | `AutoModelForSeq2SeqLM` | +| `sequence-classification` | `AutoModelForSequenceClassification` | +| `token-classification` | `AutoModelForTokenClassification` | + +Per ciascuna configurazione, puoi trovare l'elenco delle funzionalità supportate tramite il +`FeaturesManager`. 
Ad esempio, per DistilBERT abbiamo:
+
+```python
+>>> from transformers.onnx.features import FeaturesManager
+
+>>> distilbert_features = list(FeaturesManager.get_supported_features_for_model_type("distilbert").keys())
+>>> print(distilbert_features)
+["default", "masked-lm", "causal-lm", "sequence-classification", "token-classification", "question-answering"]
+```
+
+Puoi quindi passare una di queste funzionalità all'argomento `--feature` del pacchetto `transformers.onnx`. Ad esempio, per esportare un modello di classificazione del testo possiamo scegliere un modello già ottimizzato con fine-tuning dall'Hub ed eseguire:
+
+```bash
+python -m transformers.onnx --model=distilbert-base-uncased-finetuned-sst-2-english \
+    --feature=sequence-classification onnx/
+```
+
+che visualizzerà i seguenti log:
+
+```bash
+Validating ONNX model...
+  -[✓] ONNX model output names match reference model ({'logits'})
+  - Validating ONNX Model output "logits":
+    -[✓] (2, 2) matches (2, 2)
+    -[✓] all values close (atol: 1e-05)
+All good, model saved at: onnx/model.onnx
+```
+
+Puoi notare che in questo caso i nomi di output del modello ottimizzato sono `logits` invece di `last_hidden_state`, che avevamo visto con il checkpoint `distilbert-base-uncased` precedente. Questo è quanto ci si aspetta dal modello ottimizzato con fine-tuning, visto che ha una testa di classificazione di sequenze.
+
+
+
+Le caratteristiche che hanno un suffisso `with-past` (ad es. `causal-lm-with-past`) corrispondono a topologie di modello con stati nascosti precalcolati (chiavi e valori nei blocchi di attenzione) che possono essere utilizzati per una decodifica autoregressiva veloce.
+
+
+
+
+### Esportazione di un modello per un'architettura non supportata
+
+Se desideri esportare un modello la cui architettura non è nativamente supportata dalla libreria, ci sono tre passaggi principali da seguire:
+
+1. Implementare una configurazione ONNX personalizzata.
+2. Esportare il modello in ONNX.
+3. Convalidare gli output di PyTorch e del modello esportato.
+
+In questa sezione, vedremo come DistilBERT è stato implementato per mostrare cosa è coinvolto in ogni passaggio.
+
+#### Implementazione di una configurazione ONNX personalizzata
+
+Iniziamo con l'oggetto di configurazione ONNX. Forniamo tre classi astratte da cui ereditare, a seconda del tipo di architettura del modello che desideri esportare:
+
+* I modelli basati su encoder ereditano da [`~onnx.config.OnnxConfig`]
+* I modelli basati su decoder ereditano da [`~onnx.config.OnnxConfigWithPast`]
+* I modelli encoder-decoder ereditano da [`~onnx.config.OnnxSeq2SeqConfigWithPast`]
+
+
+
+Un buon modo per implementare una configurazione ONNX personalizzata è guardare l'implementazione esistente nel file `configuration_<model_name>.py` di un'architettura simile.
+
+
+
+Poiché DistilBERT è un modello basato su encoder, la sua configurazione eredita da `OnnxConfig`:
+
+```python
+>>> from typing import Mapping, OrderedDict
+>>> from transformers.onnx import OnnxConfig
+
+
+>>> class DistilBertOnnxConfig(OnnxConfig):
+...     @property
+...     def inputs(self) -> Mapping[str, Mapping[int, str]]:
+...         return OrderedDict(
+...             [
+...                 ("input_ids", {0: "batch", 1: "sequence"}),
+...                 ("attention_mask", {0: "batch", 1: "sequence"}),
+...             ]
+...         )
+```
+
+Ogni oggetto di configurazione deve implementare la proprietà `inputs` e restituire una mappatura, dove ogni chiave corrisponde a un input previsto e ogni valore indica l'asse di quell'input. Per DistilBERT, possiamo vedere che sono richiesti due input: `input_ids` e `attention_mask`.
+Questi input hanno la stessa forma, `(batch_size, sequence_length)`, ed è per questo motivo che nella configurazione compaiono gli stessi assi.
+
+
+
+Puoi notare che la proprietà `inputs` per `DistilBertOnnxConfig` restituisce un `OrderedDict`. Ciò garantisce che gli input corrispondano alla loro posizione relativa all'interno del metodo `PreTrainedModel.forward()` durante il tracciamento del grafico. Raccomandiamo di usare un `OrderedDict` per le proprietà `inputs` e `outputs` quando si implementano configurazioni ONNX personalizzate.
+
+
+
+Dopo aver implementato una configurazione ONNX, è possibile istanziarla fornendole la configurazione del modello di base, come segue:
+
+```python
+>>> from transformers import AutoConfig
+
+>>> config = AutoConfig.from_pretrained("distilbert-base-uncased")
+>>> onnx_config = DistilBertOnnxConfig(config)
+```
+
+L'oggetto risultante ha diverse proprietà utili. Ad esempio, è possibile visualizzare l'insieme di operatori (opset) ONNX che verrà utilizzato durante l'esportazione:
+
+```python
+>>> print(onnx_config.default_onnx_opset)
+11
+```
+
+È inoltre possibile visualizzare gli output associati al modello come segue:
+
+```python
+>>> print(onnx_config.outputs)
+OrderedDict([("last_hidden_state", {0: "batch", 1: "sequence"})])
+```
+
+Puoi notare che la proprietà degli output segue la stessa struttura degli input: restituisce un `OrderedDict` di output con i rispettivi nomi e le loro forme. La struttura degli output è legata alla scelta della funzionalità con cui viene inizializzata la configurazione. Per impostazione predefinita, la configurazione ONNX viene inizializzata con la funzionalità `default`, che corrisponde all'esportazione di un modello caricato con la classe `AutoModel`. Se desideri esportare una topologia di modello diversa, è sufficiente fornire una funzionalità diversa all'argomento `task` quando inizializzi la configurazione ONNX. Ad esempio, se volessimo esportare DistilBERT con una testa di classificazione per sequenze, potremmo usare:
+
+```python
+>>> from transformers import AutoConfig
+
+>>> config = AutoConfig.from_pretrained("distilbert-base-uncased")
+>>> onnx_config_for_seq_clf = DistilBertOnnxConfig(config, task="sequence-classification")
+>>> print(onnx_config_for_seq_clf.outputs)
+OrderedDict([('logits', {0: 'batch'})])
+```
+
+
+
+Tutte le proprietà e i metodi di base associati a [`~onnx.config.OnnxConfig`] e alle altre classi di configurazione possono essere sovrascritti se necessario. Guarda [`BartOnnxConfig`] per un esempio avanzato.
+
+
+
+#### Esportazione del modello
+
+Una volta implementata la configurazione ONNX, il passaggio successivo consiste nell'esportare il modello. Qui possiamo usare la funzione `export()` fornita dal pacchetto `transformers.onnx`. Questa funzione si aspetta la configurazione ONNX, insieme al modello di base, al tokenizer e al percorso in cui salvare il file esportato:
+
+```python
+>>> from pathlib import Path
+>>> from transformers.onnx import export
+>>> from transformers import AutoTokenizer, AutoModel
+
+>>> onnx_path = Path("model.onnx")
+>>> model_ckpt = "distilbert-base-uncased"
+>>> base_model = AutoModel.from_pretrained(model_ckpt)
+>>> tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+
+>>> onnx_inputs, onnx_outputs = export(tokenizer, base_model, onnx_config, onnx_config.default_onnx_opset, onnx_path)
+```
+
+Gli `onnx_inputs` e `onnx_outputs` restituiti dalla funzione `export()` sono liste delle chiavi definite nelle proprietà `inputs` e `outputs` della configurazione.
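+
+A titolo puramente indicativo (si tratta di uno sketch, non di un output garantito dalla libreria), per il checkpoint DistilBERT esportato qui sopra con la funzionalità predefinita ci si aspetta liste simili a queste:
+
+```python
+>>> print(onnx_inputs)
+['input_ids', 'attention_mask']
+>>> print(onnx_outputs)
+['last_hidden_state']
+```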
+
+Una volta esportato il modello, puoi verificare che sia ben formato come segue:
+
+```python
+>>> import onnx
+
+>>> onnx_model = onnx.load("model.onnx")
+>>> onnx.checker.check_model(onnx_model)
+```
+
+
+
+Se il tuo modello è più grande di 2 GB, vedrai che durante l'esportazione vengono creati molti file aggiuntivi. Questo è _previsto_, perché ONNX utilizza i [Protocol Buffer](https://developers.google.com/protocol-buffers/) per memorizzare il modello e questi hanno un limite di dimensione di 2 GB. Vedi la [documentazione ONNX](https://github.com/onnx/onnx/blob/master/docs/ExternalData.md) per istruzioni su come caricare modelli con dati esterni.
+
+
+
+#### Convalida degli output del modello
+
+Il passaggio finale consiste nel verificare che gli output del modello di base e di quello esportato corrispondano entro una soglia di tolleranza assoluta. Qui possiamo usare la funzione `validate_model_outputs()` fornita dal pacchetto `transformers.onnx` come segue:
+
+```python
+>>> from transformers.onnx import validate_model_outputs
+
+>>> validate_model_outputs(
+...     onnx_config, tokenizer, base_model, onnx_path, onnx_outputs, onnx_config.atol_for_validation
+... )
+```
+
+Questa funzione usa il metodo `OnnxConfig.generate_dummy_inputs()` per generare gli input per il modello di base e per quello esportato; la tolleranza assoluta può essere definita nella configurazione. In genere la corrispondenza numerica è nell'intervallo da 1e-6 a 1e-4, anche se è probabile che qualsiasi valore inferiore a 1e-3 vada bene.
+
+### Contribuire con una nuova configurazione a 🤗 Transformers
+
+Stiamo cercando di espandere l'insieme di configurazioni già pronte e di accettare contributi della community! Se vuoi contribuire con la tua aggiunta alla libreria, dovrai:
+
+* Implementare la configurazione ONNX nel file `configuration_<model_name>.py` corrispondente
+* Includere l'architettura del modello e le funzionalità corrispondenti in [`~onnx.features.FeaturesManager`]
+* Aggiungere la tua architettura del modello ai test in `test_onnx_v2.py`
+
+Scopri com'è stata aggiunta la configurazione per [IBERT](https://github.com/huggingface/transformers/pull/14868/files) per avere un'idea di cosa comporta.
+
+## TorchScript
+
+
+
+Questo è l'inizio dei nostri esperimenti con TorchScript e stiamo ancora esplorando le sue capacità con modelli con input di dimensione variabile. È una nostra priorità e approfondiremo le nostre analisi nelle prossime versioni, con più esempi di codice, un'implementazione più flessibile e benchmark che confrontano il codice basato su Python con quello compilato con TorchScript.
+
+
+
+Secondo la documentazione di PyTorch: "TorchScript è un modo per creare modelli serializzabili e ottimizzabili a partire da codice PyTorch". I due moduli di PyTorch [JIT e TRACE](https://pytorch.org/docs/stable/jit.html) consentono allo sviluppatore di esportare il proprio modello per riutilizzarlo in altri programmi, come i programmi C++ orientati all'efficienza.
+
+Abbiamo fornito un'interfaccia che consente l'esportazione di modelli 🤗 Transformers in TorchScript, in modo che possano essere riutilizzati in un ambiente diverso da un programma Python basato su PyTorch. Qui spieghiamo come esportare e utilizzare i nostri modelli con TorchScript.
+
+Esportare un modello richiede due cose:
+
+- Un passaggio in avanti con input fittizi.
+- L'istanziazione del modello con il flag `torchscript`.
+
+Questi requisiti implicano diverse cose a cui gli sviluppatori dovrebbero prestare attenzione.
+Questi dettagli sono illustrati di seguito.
+
+### Flag TorchScript e pesi legati
+
+Questo flag è necessario perché la maggior parte dei modelli linguistici in questo repository ha pesi legati tra lo strato "Embedding" e lo strato "Decoding". TorchScript non consente l'esportazione di modelli che hanno pesi legati, quindi è necessario prima slegare e clonare i pesi.
+
+Ciò implica che i modelli istanziati con il flag `torchscript` hanno lo strato `Embedding` e lo strato `Decoding` separati, il che significa che non dovrebbero essere addestrati ulteriormente. L'addestramento desincronizzerebbe i due strati, portando a risultati inaspettati.
+
+Questo non è il caso dei modelli che non hanno una testa di modello linguistico (LM head), poiché questi non hanno pesi legati. Questi modelli possono essere esportati in sicurezza senza il flag `torchscript`.
+
+### Input fittizi e lunghezze standard
+
+Gli input fittizi sono usati per eseguire un passaggio in avanti del modello. Mentre i valori degli input si propagano attraverso gli strati, PyTorch tiene traccia delle diverse operazioni eseguite su ciascun tensore. Queste operazioni registrate vengono quindi utilizzate per creare la "traccia" del modello.
+
+La traccia viene creata in funzione delle dimensioni degli input. È quindi vincolata alle dimensioni dell'input fittizio e non funzionerà per altre lunghezze di sequenza o dimensioni di batch. Quando si prova con una dimensione diversa, viene sollevato un errore come:
+
+`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2`
+
+Si consiglia pertanto di tracciare il modello con una dimensione di input fittizio grande almeno quanto il più grande input che verrà fornito al modello durante l'inferenza. È possibile eseguire il padding per riempire i valori mancanti. Il modello sarà però tracciato con una dimensione di input grande e, di conseguenza, anche le dimensioni delle diverse matrici saranno grandi, con un maggior numero di calcoli.
+
+Si raccomanda di prestare attenzione al numero totale di operazioni eseguite su ciascun input e di seguire da vicino le prestazioni durante l'esportazione di modelli con lunghezza di sequenza variabile.
+
+### Usare TorchScript in Python
+
+Di seguito è riportato un esempio che mostra come salvare e caricare i modelli e come utilizzare la traccia per l'inferenza.
+
+#### Salvare un modello
+
+Questo frammento di codice mostra come usare TorchScript per esportare un `BertModel`. Qui il `BertModel` è istanziato a partire da una classe `BertConfig` e quindi salvato su disco con il nome file `traced_bert.pt`:
+
+```python
+from transformers import BertModel, BertTokenizer, BertConfig
+import torch
+
+enc = BertTokenizer.from_pretrained("bert-base-uncased")
+
+# Tokenizing input text
+text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
+tokenized_text = enc.tokenize(text)
+
+# Masking one of the input tokens
+masked_index = 8
+tokenized_text[masked_index] = "[MASK]"
+indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
+segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
+
+# Creating a dummy input
+tokens_tensor = torch.tensor([indexed_tokens])
+segments_tensors = torch.tensor([segments_ids])
+dummy_input = [tokens_tensor, segments_tensors]
+
+# Initializing the model with the torchscript flag
+# Flag set to True even though it is not necessary as this model does not have an LM Head.
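+# Note: torchscript=True marks the configuration for tracing; with this flag any weights shared
+# between the embedding and decoding layers would be untied and cloned (see the tied-weights section above).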
+config = BertConfig(
+    vocab_size_or_config_json_file=32000,
+    hidden_size=768,
+    num_hidden_layers=12,
+    num_attention_heads=12,
+    intermediate_size=3072,
+    torchscript=True,
+)
+
+# Instantiating the model
+model = BertModel(config)
+
+# The model needs to be in evaluation mode
+model.eval()
+
+# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag
+model = BertModel.from_pretrained("bert-base-uncased", torchscript=True)
+
+# Creating the trace
+traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
+torch.jit.save(traced_model, "traced_bert.pt")
+```
+
+#### Caricare un modello
+
+Questo frammento di codice mostra come caricare il `BertModel` che era stato precedentemente salvato su disco con il nome `traced_bert.pt`. Stiamo riutilizzando il `dummy_input` precedentemente inizializzato.
+
+```python
+loaded_model = torch.jit.load("traced_bert.pt")
+loaded_model.eval()
+
+all_encoder_layers, pooled_output = loaded_model(*dummy_input)
+```
+
+#### Utilizzare un modello tracciato per l'inferenza
+
+Usare il modello tracciato per l'inferenza è semplice come usare il suo metodo dunder `__call__`:
+
+```python
+traced_model(tokens_tensor, segments_tensors)
+```
+
+### Implementare modelli HuggingFace TorchScript su AWS utilizzando Neuron SDK
+
+AWS ha introdotto la famiglia di istanze [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) per l'inferenza di machine learning a basso costo e ad alte prestazioni nel cloud. Le istanze Inf1 sono alimentate dal chip AWS Inferentia, un acceleratore hardware personalizzato, specializzato in carichi di lavoro di inferenza di deep learning. [AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) è l'SDK per Inferentia che supporta il tracciamento e l'ottimizzazione dei modelli transformers per la distribuzione su Inf1. L'SDK Neuron fornisce:
+
+
+1. API di facile utilizzo, con una sola riga di codice da modificare, per tracciare e ottimizzare un modello TorchScript per l'inferenza nel cloud.
+2. Ottimizzazioni delle prestazioni pronte all'uso per un [miglior rapporto costi-prestazioni](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/).
+3. Supporto per i modelli transformer di HuggingFace costruiti con [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) o [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html).
+
+#### Implicazioni
+
+I modelli Transformers basati sull'architettura [BERT (Bidirectional Encoder Representations from Transformers)](https://huggingface.co/docs/transformers/main/model_doc/bert), o le sue varianti come [distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) e [roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta), funzioneranno al meglio su Inf1 per attività non generative come il question answering estrattivo, la classificazione di sequenze e la classificazione di token. In alternativa, le attività di generazione di testo possono essere adattate per essere eseguite su Inf1, secondo questo [tutorial AWS Neuron MarianMT](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html).
+Ulteriori informazioni sui modelli che possono essere convertiti fuori dagli schemi su Inferentia possono essere +trovati nella [sezione Model Architecture Fit della documentazione Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia). + +#### Dipendenze + +L'utilizzo di AWS Neuron per convertire i modelli richiede le seguenti dipendenze e l'ambiente: + +* A [Neuron SDK environment](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide), + which comes pre-configured on [AWS Deep Learning AMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html). + +#### Convertire un modello per AWS Neuron + +Usando lo stesso script come in [Usando TorchScipt in Python](https://huggingface.co/docs/transformers/main/en/serialization#using-torchscript-in-python) +per tracciare un "BertModel", importi l'estensione del framework `torch.neuron` per accedere +i componenti di Neuron SDK tramite un'API Python. + +```python +from transformers import BertModel, BertTokenizer, BertConfig +import torch +import torch.neuron +``` +E modificare solo la riga di codice di traccia + +Da: + +```python +torch.jit.trace(model, [tokens_tensor, segments_tensors]) +``` + +A: + +```python +torch.neuron.trace(model, [token_tensor, segments_tensors]) +``` + +Questa modifica consente a Neuron SDK di tracciare il modello e ottimizzarlo per l'esecuzione nelle istanze Inf1. + +Per ulteriori informazioni sulle funzionalità, gli strumenti, i tutorial di esempi e gli ultimi aggiornamenti di AWS Neuron SDK, +consultare la [documentazione AWS NeuronSDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html). \ No newline at end of file diff --git a/docs/source/it/serialization.mdx b/docs/source/it/serialization.mdx deleted file mode 100644 index 1dde00f429bd..000000000000 --- a/docs/source/it/serialization.mdx +++ /dev/null @@ -1,673 +0,0 @@ - - -# Esporta modelli 🤗 Transformers - -Se devi implementare 🤗 modelli Transformers in ambienti di produzione, noi -consigliamo di esportarli in un formato serializzato che può essere caricato ed eseguito -su runtime e hardware specializzati. In questa guida ti mostreremo come farlo -esporta 🤗 Modelli Transformers in due formati ampiamente utilizzati: ONNX e TorchScript. - -Una volta esportato, un modello può essere ottimizato per l'inferenza tramite tecniche come -la quantizzazione e soppressione. Se sei interessato a ottimizzare i tuoi modelli per l'esecuzione -con la massima efficienza, dai un'occhiata a [🤗 Optimum -library](https://github.com/huggingface/optimum). - -## ONNX - -Il progetto [ONNX (Open Neural Network eXchange)](http://onnx.ai) Il progetto onnx è un open -standard che definisce un insieme comune di operatori e un formato di file comune a -rappresentano modelli di deep learning in un'ampia varietà di framework, tra cui -PyTorch e TensorFlow. Quando un modello viene esportato nel formato ONNX, questi -operatori sono usati per costruire un grafico computazionale (often called an -_intermediate representation_) che rappresenta il flusso di dati attraverso la -rete neurale. - -Esponendo un grafico con operatori e tipi di dati standardizzati, ONNX rende -più facile passare da un framework all'altro. Ad esempio, un modello allenato in PyTorch può -essere esportato in formato ONNX e quindi importato in TensorFlow (e viceversa). 
- -🤗 Transformers fornisce un pacchetto `transformers.onnx` che ti consente di -convertire i checkpoint del modello in un grafico ONNX sfruttando gli oggetti di configurazione. -Questi oggetti di configurazione sono già pronti per una serie di architetture di modelli, -e sono progettati per essere facilmente estensibili ad altre architetture. - -Le configurazioni pronte includono le seguenti architetture: - - - -- ALBERT -- BART -- BEiT -- BERT -- BigBird -- BigBird-Pegasus -- Blenderbot -- BlenderbotSmall -- CamemBERT -- ConvBERT -- Data2VecText -- Data2VecVision -- DeiT -- DistilBERT -- ELECTRA -- FlauBERT -- GPT Neo -- GPT-J -- I-BERT -- LayoutLM -- M2M100 -- Marian -- mBART -- MobileBERT -- OpenAI GPT-2 -- Perceiver -- PLBart -- RoBERTa -- RoFormer -- SqueezeBERT -- T5 -- ViT -- XLM -- XLM-RoBERTa -- XLM-RoBERTa-XL - -Nelle prossime due sezioni, ti mostreremo come: - -* Esporta un modello supportato usando il pacchetto `transformers.onnx`. -* Esporta un modello personalizzato per un'architettura non supportata. - -### Esportazione di un modello in ONNX - -Per esportare un modello 🤗 Transformers in ONNX, dovrai prima installarne alcune -dipendenze extra: - -```bash -pip install transformers[onnx] -``` - -Il pacchetto `transformers.onnx` può essere usato come modulo Python: - -```bash -python -m transformers.onnx --help - -usage: Hugging Face Transformers ONNX exporter [-h] -m MODEL [--feature {causal-lm, ...}] [--opset OPSET] [--atol ATOL] output - -positional arguments: - output Path indicating where to store generated ONNX model. - -optional arguments: - -h, --help show this help message and exit - -m MODEL, --model MODEL - Model ID on huggingface.co or path on disk to load model from. - --feature {causal-lm, ...} - The type of features to export the model with. - --opset OPSET ONNX opset version to export the model with. - --atol ATOL Absolute difference tolerance when validating the model. -``` - -L'esportazione di un checkpoint utilizzando una configurazione già pronta può essere eseguita come segue: - -```bash -python -m transformers.onnx --model=distilbert-base-uncased onnx/ -``` - -che dovrebbe mostrare i seguenti log: - -```bash -Validating ONNX model... - -[✓] ONNX model output names match reference model ({'last_hidden_state'}) - - Validating ONNX Model output "last_hidden_state": - -[✓] (2, 8, 768) matches (2, 8, 768) - -[✓] all values close (atol: 1e-05) -All good, model saved at: onnx/model.onnx -``` - -Questo esporta un grafico ONNX del checkpoint definito dall'argomento `--model`. -In questo esempio è `distilbert-base-uncased`, ma può essere qualsiasi checkpoint -Hugging Face Hub o uno memorizzato localmente. - -Il file risultante `model.onnx` può quindi essere eseguito su uno dei [tanti -acceleratori](https://onnx.ai/supported-tools.html#deployModel) che supportano il -lo standard ONNX. 
Ad esempio, possiamo caricare ed eseguire il modello con [ONNX -Runtime](https://onnxruntime.ai/) come segue: - -```python ->>> from transformers import AutoTokenizer ->>> from onnxruntime import InferenceSession - ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") ->>> session = InferenceSession("onnx/model.onnx") ->>> # ONNX Runtime expects NumPy arrays as input ->>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np") ->>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs)) -``` - -I nomi di output richiesti (cioè `["last_hidden_state"]`) possono essere ottenuti -dando un'occhiata alla configurazione ONNX di ogni modello. Ad esempio, per -DistilBERT abbiamo: - -```python ->>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig - ->>> config = DistilBertConfig() ->>> onnx_config = DistilBertOnnxConfig(config) ->>> print(list(onnx_config.outputs.keys())) -["last_hidden_state"] -``` - -Il processo è identico per i checkpoint TensorFlow sull'hub. Ad esempio, noi -possiamo esportare un checkpoint TensorFlow puro da [Keras -organizzazione](https://huggingface.co/keras-io) come segue: - -```bash -python -m transformers.onnx --model=keras-io/transformers-qa onnx/ -``` - -Per esportare un modello memorizzato localmente, devi disporre dei pesi del modello -e file tokenizer memorizzati in una directory. Ad esempio, possiamo caricare e salvare un -checkpoint come segue: - - - -```python ->>> from transformers import AutoTokenizer, AutoModelForSequenceClassification - ->>> # Load tokenizer and PyTorch weights form the Hub ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") ->>> pt_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") ->>> # Save to disk ->>> tokenizer.save_pretrained("local-pt-checkpoint") ->>> pt_model.save_pretrained("local-pt-checkpoint") -``` - -Una volta salvato il checkpoint, possiamo esportarlo su ONNX puntando l'argomento `--model` -del pacchetto `transformers.onnx` nella directory desiderata: - -```bash -python -m transformers.onnx --model=local-pt-checkpoint onnx/ -``` - - -```python ->>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification - ->>> # Load tokenizer and TensorFlow weights from the Hub ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") ->>> # Save to disk ->>> tokenizer.save_pretrained("local-tf-checkpoint") ->>> tf_model.save_pretrained("local-tf-checkpoint") -``` - -Once the checkpoint is saved, we can export it to ONNX by pointing the `--model` -argument of the `transformers.onnx` package to the desired directory: - -```bash -python -m transformers.onnx --model=local-tf-checkpoint onnx/ -``` - - - -### Selezione delle caratteristiche per diverse topologie di modello - -Ogni configurazione già pronta viene fornita con una serie di _caratteristiche_ che ti consentono di -esportare modelli per diversi tipi di topologie o attività. 
Come mostrato nella tabella
-di seguito, ogni caratteristica è associata a una diversa Auto Class:
-
-| Caratteristica                       | Auto Class                           |
-| ------------------------------------ | ------------------------------------ |
-| `causal-lm`, `causal-lm-with-past`   | `AutoModelForCausalLM`               |
-| `default`, `default-with-past`       | `AutoModel`                          |
-| `masked-lm`                          | `AutoModelForMaskedLM`               |
-| `question-answering`                 | `AutoModelForQuestionAnswering`      |
-| `seq2seq-lm`, `seq2seq-lm-with-past` | `AutoModelForSeq2SeqLM`              |
-| `sequence-classification`            | `AutoModelForSequenceClassification` |
-| `token-classification`               | `AutoModelForTokenClassification`    |
-
-Per ciascuna configurazione, puoi trovare l'elenco delle funzionalità supportate tramite il
-`FeaturesManager`. Ad esempio, per DistilBERT abbiamo:
-
-```python
->>> from transformers.onnx.features import FeaturesManager
-
->>> distilbert_features = list(FeaturesManager.get_supported_features_for_model_type("distilbert").keys())
->>> print(distilbert_features)
-["default", "masked-lm", "causal-lm", "sequence-classification", "token-classification", "question-answering"]
-```
-
-Puoi quindi passare una di queste funzionalità all'argomento `--feature` nel
-pacchetto `transformers.onnx`. Ad esempio, per esportare un modello di classificazione del testo
-possiamo scegliere un modello ottimizzato dall'Hub ed eseguire:
-
-```bash
-python -m transformers.onnx --model=distilbert-base-uncased-finetuned-sst-2-english \
-                            --feature=sequence-classification onnx/
-```
-
-che visualizzerà i seguenti log:
-
-```bash
-Validating ONNX model...
-
-[✓] ONNX model output names match reference model ({'logits'})
-
-	- Validating ONNX Model output "logits":
-
-[✓] (2, 2) matches (2, 2)
-
-[✓] all values close (atol: 1e-05)
-All good, model saved at: onnx/model.onnx
-```
-
-Puoi notare che in questo caso i nomi di output del modello ottimizzato sono
-`logits` invece di `last_hidden_state` che abbiamo visto con il
-checkpoint `distilbert-base-uncased` precedente. Questo è quanto ci si aspetta dal
-modello ottimizzato, visto che ha una testa di classificazione delle sequenze.
-
-
-
-Le caratteristiche che hanno un suffisso `with-past` (ad es. `causal-lm-with-past`)
-corrispondono a topologie di modello con stati nascosti precalcolati (chiave e valori
-nei blocchi di attenzione) che possono essere utilizzati per la decodifica autoregressiva veloce.
-
-
-
-
-### Esportazione di un modello per un'architettura non supportata
-
-Se desideri esportare un modello la cui architettura non è nativamente supportata dalla
-libreria, ci sono tre passaggi principali da seguire:
-
-1. Implementare una configurazione ONNX personalizzata.
-2. Esportare il modello in ONNX.
-3. Convalidare che gli output del modello PyTorch e di quello esportato corrispondano.
-
-In questa sezione, vedremo come DistilBERT è stato implementato per mostrare cosa è
-coinvolto in ogni passaggio.
-
-#### Implementazione di una configurazione ONNX personalizzata
-
-Iniziamo con l'oggetto di configurazione ONNX. Forniamo tre classi
-astratte da cui ereditare, a seconda del tipo di architettura
-del modello che desideri esportare:
-
-* I modelli basati su encoder ereditano da [`~onnx.config.OnnxConfig`]
-* I modelli basati su decoder ereditano da [`~onnx.config.OnnxConfigWithPast`]
-* I modelli encoder-decoder ereditano da [`~onnx.config.OnnxSeq2SeqConfigWithPast`]
-
-
-
-Un buon modo per implementare una configurazione ONNX personalizzata è guardare l'implementazione
-esistente nel file `configuration_.py` di un'architettura simile.
- - - -Poiché DistilBERT è un modello basato su encoder, la sua configurazione eredita da -`OnnxConfig`: - -```python ->>> from typing import Mapping, OrderedDict ->>> from transformers.onnx import OnnxConfig - - ->>> class DistilBertOnnxConfig(OnnxConfig): -... @property -... def inputs(self) -> Mapping[str, Mapping[int, str]]: -... return OrderedDict( -... [ -... ("input_ids", {0: "batch", 1: "sequence"}), -... ("attention_mask", {0: "batch", 1: "sequence"}), -... ] -... ) -``` - -Ogni oggetto di configurazione deve implementare la proprietà `inputs` e restituire una -mappatura, dove ogni chiave corrisponde a un input previsto e ogni valore -indica l'asse di quell'input. Per DistilBERT, possiamo vedere che sono richiesti -due input: `input_ids` e `attention_mask`. Questi inputs hanno la stessa forma di -`(batch_size, sequence_length)` per questo motivo vediamo gli stessi assi usati nella -configurazione. - - - -Puoi notare che la proprietà `inputs` per `DistilBertOnnxConfig` restituisce un -`OrdinatoDict`. Ciò garantisce che gli input corrispondano alla loro posizione -relativa all'interno del metodo `PreTrainedModel.forward()` durante il tracciamento del grafico. -Raccomandiamo di usare un `OrderedDict` per le proprietà `inputs` e `outputs` -quando si implementano configurazioni ONNX personalizzate. - - - -Dopo aver implementato una configurazione ONNX, è possibile istanziarla -fornendo alla configurazione del modello base come segue: - -```python ->>> from transformers import AutoConfig - ->>> config = AutoConfig.from_pretrained("distilbert-base-uncased") ->>> onnx_config = DistilBertOnnxConfig(config) -``` - -L'oggetto risultante ha diverse proprietà utili. Ad esempio è possibile visualizzare il -Set operatore ONNX che verrà utilizzato durante l'esportazione: - -```python ->>> print(onnx_config.default_onnx_opset) -11 -``` - -È inoltre possibile visualizzare gli output associati al modello come segue: - -```python ->>> print(onnx_config.outputs) -OrderedDict([("last_hidden_state", {0: "batch", 1: "sequence"})]) -``` - -Puoi notare che la proprietà degli output segue la stessa struttura degli input; esso -restituisce un `OrderedDict` di output con nome e le loro forme. La struttura di output -è legato alla scelta della funzione con cui viene inizializzata la configurazione. -Per impostazione predefinita, la configurazione ONNX viene inizializzata con la funzione 'predefinita' -che corrisponde all'esportazione di un modello caricato con la classe `AutoModel`. Se tu -desideri esportare una topologia di modello diversa, è sufficiente fornire una funzionalità diversa a -l'argomento `task` quando inizializzi la configurazione ONNX. Ad esempio, se -volevamo esportare DistilBERT con una testa di classificazione per sequenze, potremmo -usare: - -```python ->>> from transformers import AutoConfig - ->>> config = AutoConfig.from_pretrained("distilbert-base-uncased") ->>> onnx_config_for_seq_clf = DistilBertOnnxConfig(config, task="sequence-classification") ->>> print(onnx_config_for_seq_clf.outputs) -OrderedDict([('logits', {0: 'batch'})]) -``` - - - -Tutte le proprietà e i metodi di base associati a [`~onnx.config.OnnxConfig`] e le -altre classi di configurazione possono essere sovrascritte se necessario. Guarda -[`BartOnnxConfig`] per un esempio avanzato. - - - -#### Esportazione del modello - -Una volta implementata la configurazione ONNX, il passaggio successivo consiste nell'esportare il -modello. Qui possiamo usare la funzione `export()` fornita dal -pacchetto `transformers.onnx`. 
Questa funzione prevede la configurazione ONNX, insieme -con il modello base e il tokenizer e il percorso per salvare il file esportato: - -```python ->>> from pathlib import Path ->>> from transformers.onnx import export ->>> from transformers import AutoTokenizer, AutoModel - ->>> onnx_path = Path("model.onnx") ->>> model_ckpt = "distilbert-base-uncased" ->>> base_model = AutoModel.from_pretrained(model_ckpt) ->>> tokenizer = AutoTokenizer.from_pretrained(model_ckpt) - ->>> onnx_inputs, onnx_outputs = export(tokenizer, base_model, onnx_config, onnx_config.default_onnx_opset, onnx_path) -``` - -Gli `onnx_inputs` e `onnx_outputs` restituiti dalla funzione `export()` sono -liste di chiavi definite nelle proprietà di `input` e `output` della -configurazione. Una volta esportato il modello, puoi verificare che il modello sia ben -formato come segue: - -```python ->>> import onnx - ->>> onnx_model = onnx.load("model.onnx") ->>> onnx.checker.check_model(onnx_model) -``` - - - -Se il tuo modello è più largo di 2 GB, vedrai che molti file aggiuntivi sono -creati durante l'esportazione. Questo è _previsto_ perché ONNX utilizza [Protocol -Buffer](https://developers.google.com/protocol-buffers/) per memorizzare il modello e -questi hanno un limite di dimensione 2 GB. Vedi la [Documentazione -ONNX](https://github.com/onnx/onnx/blob/master/docs/ExternalData.md) -per istruzioni su come caricare modelli con dati esterni. - - - -#### Convalida degli output del modello - -Il passaggio finale consiste nel convalidare gli output dal modello di base e quello esportato -corrispondere entro una soglia di tolleranza assoluta. Qui possiamo usare la -Funzione `validate_model_outputs()` fornita dal pacchetto `transformers.onnx` -come segue: - -```python ->>> from transformers.onnx import validate_model_outputs - ->>> validate_model_outputs( -... onnx_config, tokenizer, base_model, onnx_path, onnx_outputs, onnx_config.atol_for_validation -... ) -``` - -Questa funzione usa il metodo `OnnxConfig.generate_dummy_inputs()` per generare -input per il modello di base e quello esportato e la tolleranza assoluta può essere -definita nella configurazione. Generalmente troviamo una corrispondenza numerica nell'intervallo da 1e-6 -a 1e-4, anche se è probabile che qualsiasi cosa inferiore a 1e-3 vada bene. - -### Contribuire con una nuova configurazione a 🤗 Transformers - -Stiamo cercando di espandere l'insieme di configurazioni già pronte e di accettare -contributi della community! Se vuoi contribuire con la tua aggiunta -nella libreria, dovrai: - -* Implementare la configurazione ONNX nella corrispondente `configuration file -_.py` -* Includere l'architettura del modello e le funzioni corrispondenti in [`~onnx.features.FeatureManager`] -* Aggiungere la tua architettura del modello ai test in `test_onnx_v2.py` - -Scopri come stato contribuito la configurazione per [IBERT] -(https://github.com/huggingface/transformers/pull/14868/files) per -avere un'idea di cosa è coinvolto. - -## TorchScript - - - -Questo è l'inizio dei nostri esperimenti con TorchScript e stiamo ancora esplorando le sue capacità con -modelli con variable-input-size. È una nostra priorità e approfondiremo le nostre analisi nelle prossime versioni, -con più esempi di codici, un'implementazione più flessibile e benchmark che confrontano i codici basati su Python con quelli compilati con -TorchScript. - - - -Secondo la documentazione di Pytorch: "TorchScript è un modo per creare modelli serializzabili e ottimizzabili da codice -Pytorch". 
I due moduli di Pytorch [JIT e TRACE](https://pytorch.org/docs/stable/jit.html) consentono allo sviluppatore di esportare -il loro modello da riutilizzare in altri programmi, come i programmi C++ orientati all'efficienza. - -Abbiamo fornito un'interfaccia che consente l'esportazione di modelli 🤗 Transformers in TorchScript in modo che possano essere riutilizzati -in un ambiente diverso rispetto a un programma Python basato su Pytorch. Qui spieghiamo come esportare e utilizzare i nostri modelli utilizzando -TorchScript. - -Esportare un modello richiede due cose: - -- Un passaggio in avanti con input fittizzi. -- Istanziazione del modello con flag `torchscript`. - -Queste necessità implicano diverse cose a cui gli sviluppatori dovrebbero prestare attenzione. Questi dettagli mostrati sotto. - -### Flag TorchScript e pesi legati - -Questo flag è necessario perché la maggior parte dei modelli linguistici in questo repository hanno pesi legati tra il loro -strato "Embedding" e lo strato "Decoding". TorchScript non consente l'esportazione di modelli che hanno pesi -legati, quindi è necessario prima slegare e clonare i pesi. - -Ciò implica che i modelli istanziati con il flag `torchscript` hanno il loro strato `Embedding` e strato `Decoding` -separato, il che significa che non dovrebbero essere addestrati in futuro. L'allenamento de-sincronizza i due -strati, portando a risultati inaspettati. - -Questo non è il caso per i modelli che non hanno una testa del modello linguistico, poiché quelli non hanno pesi legati. Questi modelli -può essere esportato in sicurezza senza il flag `torchscript`. - -### Input fittizi e standard lengths - -Gli input fittizzi sono usati per fare un modello passaggio in avanti . Mentre i valori degli input si propagano attraverso i strati, -Pytorch tiene traccia delle diverse operazioni eseguite su ciascun tensore. Queste operazioni registrate vengono quindi utilizzate per -creare la "traccia" del modello. - -La traccia viene creata relativamente alle dimensioni degli input. È quindi vincolato dalle dimensioni dell'input -fittizio e non funzionerà per altre lunghezze di sequenza o dimensioni batch. Quando si proverà con una dimensione diversa, ci sarà errore -come: - -`La dimensione espansa del tensore (3) deve corrispondere alla dimensione esistente (7) nella dimensione non singleton 2` - -will be raised. Si consiglia pertanto di tracciare il modello con una dimensione di input fittizia grande almeno quanto il più grande -input che verrà fornito al modello durante l'inferenza. È possibile eseguire il padding per riempire i valori mancanti. Il modello -sarà tracciato con una grande dimensione di input, tuttavia, anche le dimensioni della diverse matrici saranno grandi, -risultando in più calcoli. - -Si raccomanda di prestare attenzione al numero totale di operazioni eseguite su ciascun input e di seguire da vicino le prestazioni -durante l'esportazione di modelli di sequenza-lunghezza variabili. - -### Usare TorchSscript in Python - -Di seguito è riportato un esempio, che mostra come salvare, caricare modelli e come utilizzare la traccia per l'inferenza. - -#### Salvare un modello - -Questo frammento di codice mostra come usare TorchScript per esportare un `BertModel`. 
Qui il `BertModel` è istanziato secondo -una classe `BertConfig` e quindi salvato su disco con il nome del file `traced_bert.pt` - -```python -from transformers import BertModel, BertTokenizer, BertConfig -import torch - -enc = BertTokenizer.from_pretrained("bert-base-uncased") - -# Tokenizing input text -text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" -tokenized_text = enc.tokenize(text) - -# Masking one of the input tokens -masked_index = 8 -tokenized_text[masked_index] = "[MASK]" -indexed_tokens = enc.convert_tokens_to_ids(tokenized_text) -segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1] - -# Creating a dummy input -tokens_tensor = torch.tensor([indexed_tokens]) -segments_tensors = torch.tensor([segments_ids]) -dummy_input = [tokens_tensor, segments_tensors] - -# Initializing the model with the torchscript flag -# Flag set to True even though it is not necessary as this model does not have an LM Head. -config = BertConfig( - vocab_size_or_config_json_file=32000, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - torchscript=True, -) - -# Instantiating the model -model = BertModel(config) - -# The model needs to be in evaluation mode -model.eval() - -# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag -model = BertModel.from_pretrained("bert-base-uncased", torchscript=True) - -# Creating the trace -traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors]) -torch.jit.save(traced_model, "traced_bert.pt") -``` - -#### Caricare un modello - -Questo frammento di codice mostra come caricare il `BertModel` che era stato precedentemente salvato su disco con il nome `traced_bert.pt`. -Stiamo riutilizzando il `dummy_input` precedentemente inizializzato. - -```python -loaded_model = torch.jit.load("traced_bert.pt") -loaded_model.eval() - -all_encoder_layers, pooled_output = loaded_model(*dummy_input) -``` - -#### Utilizzare un modello tracciato per l'inferenza - -Usare il modello tracciato per l'inferenza è semplice come usare il suo metodo dunder `__call__`: - -```python -traced_model(tokens_tensor, segments_tensors) -``` - -###Implementare modelli HuggingFace TorchScript su AWS utilizzando Neuron SDK - -AWS ha introdotto [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) -famiglia di istanze per l'inferenza di machine learning a basso costo e ad alte prestazioni nel cloud. -Le istanze Inf1 sono alimentate dal chip AWS Inferentia, un acceleratore hardware personalizzato, -specializzato in carichi di lavoro di inferenza di deep learning. -[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) -è l'SDK per Inferentia che supporta il tracciamento e l'ottimizzazione dei modelli transformers per -distribuzione su Inf1. L'SDK Neuron fornisce: - - -1. API di facile utilizzo con una riga di modifica del codice per tracciare e ottimizzare un modello TorchScript per l'inferenza nel cloud. -2. Ottimizzazioni delle prestazioni pronte all'uso per [miglioramento dei costi-prestazioni](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/>) -3. Supporto per i modelli di trasformatori HuggingFace costruiti con [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) - o [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html). 
- -#### Implicazioni - -Modelli Transformers basati su architettura [BERT (Bidirectional Encoder Representations from Transformers)](https://huggingface.co/docs/transformers/main/model_doc/bert), -o sue varianti come [distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) -e [roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta) -funzioneranno meglio su Inf1 per attività non generative come la question answering estrattive, -Classificazione della sequenza, Classificazione dei token. In alternativa, generazione di testo -le attività possono essere adattate per essere eseguite su Inf1, secondo questo [tutorial AWS Neuron MarianMT](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html). -Ulteriori informazioni sui modelli che possono essere convertiti fuori dagli schemi su Inferentia possono essere -trovati nella [sezione Model Architecture Fit della documentazione Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia). - -#### Dipendenze - -L'utilizzo di AWS Neuron per convertire i modelli richiede le seguenti dipendenze e l'ambiente: - -* A [Neuron SDK environment](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide), - which comes pre-configured on [AWS Deep Learning AMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html). - -#### Convertire un modello per AWS Neuron - -Usando lo stesso script come in [Usando TorchScipt in Python](https://huggingface.co/docs/transformers/main/en/serialization#using-torchscript-in-python) -per tracciare un "BertModel", importi l'estensione del framework `torch.neuron` per accedere -i componenti di Neuron SDK tramite un'API Python. - -```python -from transformers import BertModel, BertTokenizer, BertConfig -import torch -import torch.neuron -``` -E modificare solo la riga di codice di traccia - -Da: - -```python -torch.jit.trace(model, [tokens_tensor, segments_tensors]) -``` - -A: - -```python -torch.neuron.trace(model, [token_tensor, segments_tensors]) -``` - -Questa modifica consente a Neuron SDK di tracciare il modello e ottimizzarlo per l'esecuzione nelle istanze Inf1. - -Per ulteriori informazioni sulle funzionalità, gli strumenti, i tutorial di esempi e gli ultimi aggiornamenti di AWS Neuron SDK, -consultare la [documentazione AWS NeuronSDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html). \ No newline at end of file diff --git a/docs/source/it/training.md b/docs/source/it/training.md new file mode 100644 index 000000000000..be0883f07b77 --- /dev/null +++ b/docs/source/it/training.md @@ -0,0 +1,376 @@ + + +# Fine-tuning di un modello pre-addestrato + +[[open-in-colab]] + +Ci sono benefici significativi nell'usare un modello pre-addestrato. Si riducono i costi computazionali, l'impronta di carbonio e ti consente di usare modelli stato dell'arte senza doverli addestrare da zero. 🤗 Transformers consente l'accesso a migliaia di modelli pre-addestrati per un'ampia gamma di compiti. Quando usi un modello pre-addestrato, lo alleni su un dataset specifico per il tuo compito. Questo è conosciuto come fine-tuning, una tecnica di addestramento incredibilmente potente. 
In questa esercitazione, potrai fare il fine-tuning di un modello pre-addestrato, con un framework di deep learning a tua scelta: + +* Fine-tuning di un modello pre-addestrato con 🤗 Transformers [`Trainer`]. +* Fine-tuning di un modello pre-addestrato in TensorFlow con Keras. +* Fine-tuning di un modello pre-addestrato con PyTorch. + + + +## Preparare un dataset + + + +Prima di poter fare il fine-tuning di un modello pre-addestrato, scarica un dataset e preparalo per l'addestramento. La precedente esercitazione ti ha mostrato come processare i dati per l'addestramento e adesso hai l'opportunità di metterti alla prova! + +Inizia caricando il dataset [Yelp Reviews](https://huggingface.co/datasets/yelp_review_full): + +```py +>>> from datasets import load_dataset + +>>> dataset = load_dataset("yelp_review_full") +>>> dataset["train"][100] +{'label': 0, + 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'} +``` + +Come già sai, hai bisogno di un tokenizer per processare il testo e includere una strategia di padding e truncation per gestire sequenze di lunghezza variabile. Per processare il dataset in un unico passo, usa il metodo [`map`](https://huggingface.co/docs/datasets/process.html#map) di 🤗 Datasets che applica la funzione di preprocessing all'intero dataset: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + + +>>> def tokenize_function(examples): +... return tokenizer(examples["text"], padding="max_length", truncation=True) + + +>>> tokenized_datasets = dataset.map(tokenize_function, batched=True) +``` + +Se vuoi, puoi creare un sottoinsieme più piccolo del dataset per il fine-tuning così da ridurre il tempo necessario: + +```py +>>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) +>>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) +``` + + + +## Addestramento + + + + + +🤗 Transformers mette a disposizione la classe [`Trainer`] ottimizzata per addestrare modelli 🤗 Transformers, rendendo semplice iniziare l'addestramento senza scrivere manualmente il tuo ciclo di addestramento. 
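A titolo puramente indicativo, lo schizzo che segue mostra come alcune opzioni comuni (logging periodico, gradient accumulation, mixed precision) si possano abilitare tramite [`TrainingArguments`], introdotta più avanti in questa sezione; i valori riportati sono soltanto esempi ipotetici, non impostazioni consigliate:
+
+```py
+from transformers import TrainingArguments
+
+# Schizzo indicativo: i parametri esistono in TrainingArguments, ma i valori sono solo ipotesi di esempio
+training_args = TrainingArguments(
+    output_dir="test_trainer",
+    logging_steps=100,  # registra le metriche di addestramento ogni 100 step
+    gradient_accumulation_steps=4,  # accumula i gradienti su 4 batch prima di aggiornare i pesi
+    fp16=True,  # abilita la mixed precision (richiede una GPU compatibile)
+)
+```
+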
L'API [`Trainer`] supporta infatti un'ampia gamma di opzioni e funzionalità di addestramento, tra cui logging, gradient accumulation e mixed precision.
+
+Inizia caricando il tuo modello e specificando il numero di etichette (labels) attese. Dalla [dataset card](https://huggingface.co/datasets/yelp_review_full#data-fields) del dataset Yelp Review, sai che ci sono cinque etichette:
+
+```py
+>>> from transformers import AutoModelForSequenceClassification
+
+>>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+```
+
+
+
+Potresti vedere un warning dato che alcuni dei pesi pre-addestrati non sono stati utilizzati e altri pesi sono stati inizializzati casualmente. Non preoccuparti, è completamente normale! L'head pre-addestrata del modello BERT viene scartata e rimpiazzata da una classification head inizializzata casualmente. Farai il fine-tuning di questa nuova head del modello sul tuo compito di classificazione, trasferendogli la conoscenza del modello pre-addestrato.
+
+
+
+### Iperparametri per il training
+
+Successivamente, crea una classe [`TrainingArguments`] contenente tutti gli iperparametri che si possono regolare nonché le variabili per attivare le differenti opzioni di addestramento. Per questa esercitazione puoi iniziare con gli [iperparametri](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments) di addestramento predefiniti, ma sentiti libero di sperimentare per trovare la configurazione ottimale per te.
+
+Specifica dove salvare i checkpoint del tuo addestramento:
+
+```py
+>>> from transformers import TrainingArguments
+
+>>> training_args = TrainingArguments(output_dir="test_trainer")
+```
+
+### Metriche
+
+[`Trainer`] non valuta automaticamente le performance del modello durante l'addestramento. Dovrai passare a [`Trainer`] una funzione che calcola e restituisce le metriche. La libreria 🤗 Datasets mette a disposizione una semplice funzione [`accuracy`](https://huggingface.co/metrics/accuracy) che puoi caricare con la funzione `load_metric` (guarda questa [esercitazione](https://huggingface.co/docs/datasets/metrics.html) per maggiori informazioni):
+
+```py
+>>> import numpy as np
+>>> from datasets import load_metric
+
+>>> metric = load_metric("accuracy")
+```
+
+Richiama `compute` su `metric` per calcolare l'accuratezza delle tue previsioni. Prima di passare le tue previsioni a `compute`, hai bisogno di convertirle in logits (ricorda che tutti i modelli 🤗 Transformers restituiscono logits):
+
+```py
+>>> def compute_metrics(eval_pred):
+...     logits, labels = eval_pred
+...     predictions = np.argmax(logits, axis=-1)
+...     return metric.compute(predictions=predictions, references=labels)
+```
+
+Se preferisci monitorare le tue metriche di valutazione durante il fine-tuning, specifica il parametro `evaluation_strategy` nei tuoi training arguments per restituire le metriche di valutazione ad ogni epoca di addestramento:
+
+```py
+>>> from transformers import TrainingArguments, Trainer
+
+>>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
+```
+
+### Trainer
+
+Crea un oggetto [`Trainer`] col tuo modello, training arguments, dataset di training e test, e funzione di valutazione:
+
+```py
+>>> trainer = Trainer(
+...     model=model,
+...     args=training_args,
+...     train_dataset=small_train_dataset,
+...     eval_dataset=small_eval_dataset,
+...     compute_metrics=compute_metrics,
+... 
) +``` + +Poi metti a punto il modello richiamando [`~transformers.Trainer.train`]: + +```py +>>> trainer.train() +``` + + + + + + +I modelli 🤗 Transformers supportano anche l'addestramento in TensorFlow usando l'API di Keras. + +### Convertire dataset nel formato per TensorFlow + +Il [`DefaultDataCollator`] assembla tensori in lotti su cui il modello si addestrerà. Assicurati di specificare di restituire tensori per TensorFlow in `return_tensors`: + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator(return_tensors="tf") +``` + + + +[`Trainer`] usa [`DataCollatorWithPadding`] in maniera predefinita in modo da non dover specificare esplicitamente un collettore di dati. + + + +Successivamente, converti i datasets tokenizzati in TensorFlow datasets con il metodo [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Specifica il tuo input in `columns` e le tue etichette in `label_cols`: + +```py +>>> tf_train_dataset = small_train_dataset.to_tf_dataset( +... columns=["attention_mask", "input_ids", "token_type_ids"], +... label_cols=["labels"], +... shuffle=True, +... collate_fn=data_collator, +... batch_size=8, +... ) + +>>> tf_validation_dataset = small_eval_dataset.to_tf_dataset( +... columns=["attention_mask", "input_ids", "token_type_ids"], +... label_cols=["labels"], +... shuffle=False, +... collate_fn=data_collator, +... batch_size=8, +... ) +``` + +### Compilazione e addestramento + +Carica un modello TensorFlow col numero atteso di etichette: + +```py +>>> import tensorflow as tf +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) +``` + +Poi compila e fai il fine-tuning del tuo modello usando [`fit`](https://keras.io/api/models/model_training_apis/) come faresti con qualsiasi altro modello di Keras: + +```py +>>> model.compile( +... optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5), +... loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), +... metrics=tf.metrics.SparseCategoricalAccuracy(), +... ) + +>>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3) +``` + + + + + +## Addestramento in PyTorch nativo + + + + + +[`Trainer`] si occupa del ciclo di addestramento e ti consente di mettere a punto un modello con una sola riga di codice. Per chi preferisse scrivere un proprio ciclo di addestramento personale, puoi anche fare il fine-tuning di un modello 🤗 Transformers in PyTorch nativo. + +A questo punto, potresti avere bisogno di riavviare il tuo notebook o eseguire il seguente codice per liberare un po' di memoria: + +```py +del model +del pytorch_model +del trainer +torch.cuda.empty_cache() +``` + +Successivamente, postprocessa manualmente il `tokenized_dataset` per prepararlo ad essere allenato. + +1. Rimuovi la colonna `text` perché il modello non accetta testo grezzo come input: + + ```py + >>> tokenized_datasets = tokenized_datasets.remove_columns(["text"]) + ``` + +2. Rinomina la colonna `label` in `labels` perché il modello si aspetta che questo argomento si chiami `labels`: + + ```py + >>> tokenized_datasets = tokenized_datasets.rename_column("label", "labels") + ``` + +3. 
Imposta il formato del dataset in modo che restituisca tensori di PyTorch invece di liste:
+
+    ```py
+    >>> tokenized_datasets.set_format("torch")
+    ```
+
+Poi crea un piccolo sottocampione del dataset come visto precedentemente per velocizzare il fine-tuning:
+
+```py
+>>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
+>>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
+```
+
+### DataLoader
+
+Crea un `DataLoader` per i tuoi dataset di train e test così puoi iterare sui lotti di dati:
+
+```py
+>>> from torch.utils.data import DataLoader
+
+>>> train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
+>>> eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)
+```
+
+Carica il tuo modello con il numero atteso di etichette:
+
+```py
+>>> from transformers import AutoModelForSequenceClassification
+
+>>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+```
+
+### Ottimizzatore e learning rate scheduler
+
+Crea un ottimizzatore e il learning rate scheduler per fare il fine-tuning del modello. Usa l'ottimizzatore [`AdamW`](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) di PyTorch:
+
+```py
+>>> from torch.optim import AdamW
+
+>>> optimizer = AdamW(model.parameters(), lr=5e-5)
+```
+
+Crea il learning rate scheduler predefinito da [`Trainer`]:
+
+```py
+>>> from transformers import get_scheduler
+
+>>> num_epochs = 3
+>>> num_training_steps = num_epochs * len(train_dataloader)
+>>> lr_scheduler = get_scheduler(
+...     name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
+... )
+```
+
+Infine, specifica di usare una GPU come `device`, se ne hai una a disposizione. Altrimenti, l'addestramento su una CPU può richiedere diverse ore invece di un paio di minuti.
+
+```py
+>>> import torch
+
+>>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+>>> model.to(device)
+```
+
+
+
+Ottieni l'accesso gratuito a una GPU sul cloud se non ne possiedi una usando un notebook sul web come [Colaboratory](https://colab.research.google.com/) o [SageMaker StudioLab](https://studiolab.sagemaker.aws/).
+
+
+
+Ottimo, adesso possiamo addestrare! 🥳
+
+### Training loop
+
+Per tenere traccia dei tuoi progressi durante l'addestramento, usa la libreria [tqdm](https://tqdm.github.io/) per aggiungere una progress bar sopra il numero dei passi di addestramento:
+
+```py
+>>> from tqdm.auto import tqdm
+
+>>> progress_bar = tqdm(range(num_training_steps))
+
+>>> model.train()
+>>> for epoch in range(num_epochs):
+...     for batch in train_dataloader:
+...         batch = {k: v.to(device) for k, v in batch.items()}
+...         outputs = model(**batch)
+...         loss = outputs.loss
+...         loss.backward()
+
+...         optimizer.step()
+...         lr_scheduler.step()
+...         optimizer.zero_grad()
+...         progress_bar.update(1)
+```
+
+### Metriche
+
+Proprio come è necessario aggiungere una funzione di valutazione al [`Trainer`], è necessario fare lo stesso quando si scrive il proprio ciclo di addestramento. Ma invece di calcolare e riportare la metrica alla fine di ogni epoca, questa volta accumulerai tutti i batch con [`add_batch`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=add_batch#datasets.Metric.add_batch) e calcolerai la metrica alla fine.
+
+```py
+>>> metric = load_metric("accuracy")
+>>> model.eval()
+>>> for batch in eval_dataloader:
+... 
batch = {k: v.to(device) for k, v in batch.items()} +... with torch.no_grad(): +... outputs = model(**batch) + +... logits = outputs.logits +... predictions = torch.argmax(logits, dim=-1) +... metric.add_batch(predictions=predictions, references=batch["labels"]) + +>>> metric.compute() +``` + + + + + +## Altre risorse + +Per altri esempi sul fine-tuning, fai riferimento a: + +- [🤗 Transformers Examples](https://github.com/huggingface/transformers/tree/main/examples) include scripts per addestrare compiti comuni di NLP in PyTorch e TensorFlow. + +- [🤗 Transformers Notebooks](notebooks) contiene diversi notebooks su come mettere a punto un modello per compiti specifici in PyTorch e TensorFlow. diff --git a/docs/source/it/training.mdx b/docs/source/it/training.mdx deleted file mode 100644 index 68f6434bbb5a..000000000000 --- a/docs/source/it/training.mdx +++ /dev/null @@ -1,372 +0,0 @@ - - -# Fine-tuning di un modello pre-addestrato - -[[open-in-colab]] - -Ci sono benefici significativi nell'usare un modello pre-addestrato. Si riducono i costi computazionali, l'impronta di carbonio e ti consente di usare modelli stato dell'arte senza doverli addestrare da zero. 🤗 Transformers consente l'accesso a migliaia di modelli pre-addestrati per un'ampia gamma di compiti. Quando usi un modello pre-addestrato, lo alleni su un dataset specifico per il tuo compito. Questo è conosciuto come fine-tuning, una tecnica di addestramento incredibilmente potente. In questa esercitazione, potrai fare il fine-tuning di un modello pre-addestrato, con un framework di deep learning a tua scelta: - -* Fine-tuning di un modello pre-addestrato con 🤗 Transformers [`Trainer`]. -* Fine-tuning di un modello pre-addestrato in TensorFlow con Keras. -* Fine-tuning di un modello pre-addestrato con PyTorch. - - - -## Preparare un dataset - - - -Prima di poter fare il fine-tuning di un modello pre-addestrato, scarica un dataset e preparalo per l'addestramento. La precedente esercitazione ti ha mostrato come processare i dati per l'addestramento e adesso hai l'opportunità di metterti alla prova! - -Inizia caricando il dataset [Yelp Reviews](https://huggingface.co/datasets/yelp_review_full): - -```py ->>> from datasets import load_dataset - ->>> dataset = load_dataset("yelp_review_full") ->>> dataset["train"][100] -{'label': 0, - 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. 
It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'} -``` - -Come già sai, hai bisogno di un tokenizer per processare il testo e includere una strategia di padding e truncation per gestire sequenze di lunghezza variabile. Per processare il dataset in un unico passo, usa il metodo [`map`](https://huggingface.co/docs/datasets/process.html#map) di 🤗 Datasets che applica la funzione di preprocessing all'intero dataset: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") - - ->>> def tokenize_function(examples): -... return tokenizer(examples["text"], padding="max_length", truncation=True) - - ->>> tokenized_datasets = dataset.map(tokenize_function, batched=True) -``` - -Se vuoi, puoi creare un sottoinsieme più piccolo del dataset per il fine-tuning così da ridurre il tempo necessario: - -```py ->>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) ->>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) -``` - - - -## Addestramento - - - - - -🤗 Transformers mette a disposizione la classe [`Trainer`] ottimizzata per addestrare modelli 🤗 Transformers, rendendo semplice iniziare l'addestramento senza scrivere manualmente il tuo ciclo di addestramento. L'API [`Trainer`] supporta un'ampia gamma di opzioni e funzionalità di addestramento come logging, gradient accumulation e mixed precision. - -Inizia caricando il tuo modello e specificando il numero di etichette (labels) attese. Nel dataset Yelp Review [dataset card](https://huggingface.co/datasets/yelp_review_full#data-fields), sai che ci sono cinque etichette: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) -``` - - - -Potresti vedere un warning dato che alcuni dei pesi pre-addestrati non sono stati utilizzati e altri pesi sono stati inizializzati casualmente. Non preoccuparti, è completamente normale! L'head pre-addestrata del modello BERT viene scartata e rimpiazzata da una classification head inizializzata casualmente. Farai il fine-tuning di questa nuova head del modello sul tuo compito di classificazione, trasferendogli la conoscenza del modello pre-addestrato. - - - -### Iperparametri per il training - -Successivamente, crea una classe [`TrainingArguments`] contenente tutti gli iperparametri che si possono regore nonché le variabili per attivare le differenti opzioni di addestramento. Per questa esercitazione puoi iniziare con gli [iperparametri](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments) di ddestramento predefiniti, ma sentiti libero di sperimentare per trovare la configurazione ottimale per te. - -Specifica dove salvare i checkpoints del tuo addestramento: - -```py ->>> from transformers import TrainingArguments - ->>> training_args = TrainingArguments(output_dir="test_trainer") -``` - -### Metriche - -[`Trainer`] non valuta automaticamente le performance del modello durante l'addestramento. Dovrai passare a [`Trainer`] una funzione che calcola e restituisce le metriche. 
La libreria 🤗 Datasets mette a disposizione una semplice funzione [`accuracy`](https://huggingface.co/metrics/accuracy) che puoi caricare con la funzione `load_metric` (guarda questa [esercitazione](https://huggingface.co/docs/datasets/metrics.html) per maggiori informazioni): - -```py ->>> import numpy as np ->>> from datasets import load_metric - ->>> metric = load_metric("accuracy") -``` - -Richiama `compute` su `metric` per calcolare l'accuratezza delle tue previsioni. Prima di passare le tue previsioni a `compute`, hai bisogno di convertirle in logits (ricorda che tutti i modelli 🤗 Transformers restituiscono logits): - -```py ->>> def compute_metrics(eval_pred): -... logits, labels = eval_pred -... predictions = np.argmax(logits, axis=-1) -... return metric.compute(predictions=predictions, references=labels) -``` - -Se preferisci monitorare le tue metriche di valutazione durante il fine-tuning, specifica il parametro `evaluation_strategy` nei tuoi training arguments per restituire le metriche di valutazione ad ogni epoca di addestramento: - -```py ->>> from transformers import TrainingArguments, Trainer - ->>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") -``` - -### Trainer - -Crea un oggetto [`Trainer`] col tuo modello, training arguments, dataset di training e test, e funzione di valutazione: - -```py ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=small_train_dataset, -... eval_dataset=small_eval_dataset, -... compute_metrics=compute_metrics, -... ) -``` - -Poi metti a punto il modello richiamando [`~transformers.Trainer.train`]: - -```py ->>> trainer.train() -``` - - - - - - -I modelli 🤗 Transformers supportano anche l'addestramento in TensorFlow usando l'API di Keras. - -### Convertire dataset nel formato per TensorFlow - -Il [`DefaultDataCollator`] assembla tensori in lotti su cui il modello si addestrerà. Assicurati di specificare di restituire tensori per TensorFlow in `return_tensors`: - -```py ->>> from transformers import DefaultDataCollator - ->>> data_collator = DefaultDataCollator(return_tensors="tf") -``` - - - -[`Trainer`] usa [`DataCollatorWithPadding`] in maniera predefinita in modo da non dover specificare esplicitamente un collettore di dati. - - - -Successivamente, converti i datasets tokenizzati in TensorFlow datasets con il metodo [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Specifica il tuo input in `columns` e le tue etichette in `label_cols`: - -```py ->>> tf_train_dataset = small_train_dataset.to_tf_dataset( -... columns=["attention_mask", "input_ids", "token_type_ids"], -... label_cols=["labels"], -... shuffle=True, -... collate_fn=data_collator, -... batch_size=8, -... ) - ->>> tf_validation_dataset = small_eval_dataset.to_tf_dataset( -... columns=["attention_mask", "input_ids", "token_type_ids"], -... label_cols=["labels"], -... shuffle=False, -... collate_fn=data_collator, -... batch_size=8, -... 
) -``` - -### Compilazione e addestramento - -Carica un modello TensorFlow col numero atteso di etichette: - -```py ->>> import tensorflow as tf ->>> from transformers import TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) -``` - -Poi compila e fai il fine-tuning del tuo modello usando [`fit`](https://keras.io/api/models/model_training_apis/) come faresti con qualsiasi altro modello di Keras: - -```py ->>> model.compile( -... optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5), -... loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), -... metrics=tf.metrics.SparseCategoricalAccuracy(), -... ) - ->>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3) -``` - - - - - -## Addestramento in PyTorch nativo - - - - - -[`Trainer`] si occupa del ciclo di addestramento e ti consente di mettere a punto un modello con una sola riga di codice. Per chi preferisse scrivere un proprio ciclo di addestramento personale, puoi anche fare il fine-tuning di un modello 🤗 Transformers in PyTorch nativo. - -A questo punto, potresti avere bisogno di riavviare il tuo notebook o eseguire il seguente codice per liberare un po' di memoria: - -```py -del model -del pytorch_model -del trainer -torch.cuda.empty_cache() -``` - -Successivamente, postprocessa manualmente il `tokenized_dataset` per prepararlo ad essere allenato. - -1. Rimuovi la colonna `text` perché il modello non accetta testo grezzo come input: - - ```py - >>> tokenized_datasets = tokenized_datasets.remove_columns(["text"]) - ``` - -2. Rinomina la colonna `label` in `labels` perché il modello si aspetta che questo argomento si chiami `labels`: - - ```py - >>> tokenized_datasets = tokenized_datasets.rename_column("label", "labels") - ``` - -3. Imposta il formato del dataset per farti restituire tensori di PyTorch all'interno delle liste: - - ```py - >>> tokenized_datasets.set_format("torch") - ``` - -Poi crea un piccolo sottocampione del dataset come visto precedentemente per velocizzare il fine-tuning: - -```py ->>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) ->>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) -``` - -### DataLoader - -Crea un `DataLoader` per i tuoi datasets di train e test così puoi iterare sui lotti di dati: - -```py ->>> from torch.utils.data import DataLoader - ->>> train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8) ->>> eval_dataloader = DataLoader(small_eval_dataset, batch_size=8) -``` - -Carica il tuo modello con il numero atteso di etichette: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) -``` - -### Ottimizzatore e learning rate scheduler - -Crea un ottimizzatore e il learning rate scheduler per fare il fine-tuning del modello. Usa l'ottimizzatore [`AdamW`](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) di PyTorch: - -```py ->>> from torch.optim import AdamW - ->>> optimizer = AdamW(model.parameters(), lr=5e-5) -``` - -Crea il learning rate scheduler predefinito da [`Trainer`]: - -```py ->>> from transformers import get_scheduler - ->>> num_epochs = 3 ->>> num_training_steps = num_epochs * len(train_dataloader) ->>> lr_scheduler = get_scheduler( -... name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps -... 
) -``` - -Infine specifica come `device` da usare una GPU se ne hai una. Altrimenti, l'addestramento su una CPU può richiedere diverse ore invece di un paio di minuti. - -```py ->>> import torch - ->>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") ->>> model.to(device) -``` - - - -Ottieni l'accesso gratuito a una GPU sul cloud se non ne possiedi una usando un notebook sul web come [Colaboratory](https://colab.research.google.com/) o [SageMaker StudioLab](https://studiolab.sagemaker.aws/). - - - -Ottimo, adesso possiamo addestrare! 🥳 - -### Training loop - -Per tenere traccia dei tuoi progressi durante l'addestramento, usa la libreria [tqdm](https://tqdm.github.io/) per aggiungere una progress bar sopra il numero dei passi di addestramento: - -```py ->>> from tqdm.auto import tqdm - ->>> progress_bar = tqdm(range(num_training_steps)) - ->>> model.train() ->>> for epoch in range(num_epochs): -... for batch in train_dataloader: -... batch = {k: v.to(device) for k, v in batch.items()} -... outputs = model(**batch) -... loss = outputs.loss -... loss.backward() - -... optimizer.step() -... lr_scheduler.step() -... optimizer.zero_grad() -... progress_bar.update(1) -``` - -### Metriche - -Proprio come è necessario aggiungere una funzione di valutazione del [`Trainer`], è necessario fare lo stesso quando si scrive il proprio ciclo di addestramento. Ma invece di calcolare e riportare la metrica alla fine di ogni epoca, questa volta accumulerai tutti i batch con [`add_batch`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=add_batch#datasets.Metric.add_batch) e calcolerai la metrica alla fine. - -```py ->>> metric = load_metric("accuracy") ->>> model.eval() ->>> for batch in eval_dataloader: -... batch = {k: v.to(device) for k, v in batch.items()} -... with torch.no_grad(): -... outputs = model(**batch) - -... logits = outputs.logits -... predictions = torch.argmax(logits, dim=-1) -... metric.add_batch(predictions=predictions, references=batch["labels"]) - ->>> metric.compute() -``` - - - - - -## Altre risorse - -Per altri esempi sul fine-tuning, fai riferimento a: - -- [🤗 Transformers Examples](https://github.com/huggingface/transformers/tree/main/examples) include scripts per addestrare compiti comuni di NLP in PyTorch e TensorFlow. - -- [🤗 Transformers Notebooks](notebooks) contiene diversi notebooks su come mettere a punto un modello per compiti specifici in PyTorch e TensorFlow. 
diff --git a/docs/source/ja/_toctree.yml b/docs/source/ja/_toctree.yml new file mode 100644 index 000000000000..8ac8b1e3183f --- /dev/null +++ b/docs/source/ja/_toctree.yml @@ -0,0 +1,14 @@ +- sections: + - local: index + title: 🤗 Transformers + - local: installation + title: インストール + title: はじめに +- sections: + - local: accelerate + title: 🤗 Accelerate を用いた分散学習 + title: チュートリアル +- sections: + - sections: + - local: multilingual + title: 推論のための多言語モデル \ No newline at end of file diff --git a/docs/source/ja/accelerate.md b/docs/source/ja/accelerate.md new file mode 100644 index 000000000000..73e45b9cd3c5 --- /dev/null +++ b/docs/source/ja/accelerate.md @@ -0,0 +1,136 @@ + + +# 🤗 Accelerate を用いた分散学習 + +モデルが大きくなるにつれて、限られたハードウェアでより大きなモデルを訓練し、訓練速度を大幅に上昇させるための方法として並列処理が浮上してきました。1台のマシンに複数のGPUがあっても、複数のマシンにまたがる複数のGPUがあっても、あらゆるタイプの分散処理セットアップ上でユーザーが簡単に 🤗 Transformers モデルを訓練できるように、 Hugging Face では [🤗 Accelerate](https://huggingface.co/docs/accelerate) ライブラリを作成しました。このチュートリアルでは、PyTorch の訓練ループをカスタマイズして、分散処理環境での訓練を可能にする方法について学びます。 + +## セットアップ + +はじめに 🤗 Accelerate をインストールしましょう: + +```bash +pip install accelerate +``` + +そしたらインポートして [`~accelerate.Accelerator`] オブジェクトを作成しましょう。[`~accelerate.Accelerator`] は分散処理セットアップを自動的に検出し、訓練のために必要な全てのコンポーネントを初期化します。モデルをデバイスに明示的に配置する必要はありません。 + +```py +>>> from accelerate import Accelerator + +>>> accelerator = Accelerator() +``` + +## Accelerate する準備をしましょう + +次に、関連する全ての訓練オブジェクトを [`~accelerate.Accelerator.prepare`] メソッドに渡します。これには、訓練と評価それぞれのDataloader、モデル、optimizer が含まれます: + +```py +>>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( +... train_dataloader, eval_dataloader, model, optimizer +... ) +``` + +## Backward + +最後に訓練ループ内の `loss.backward()` を 🤗 Accelerate の [`~accelerate.Accelerator.backward`] メソッドで置き換えます: + +```py +>>> for epoch in range(num_epochs): +... for batch in train_dataloader: +... outputs = model(**batch) +... loss = outputs.loss +... accelerator.backward(loss) + +... optimizer.step() +... lr_scheduler.step() +... optimizer.zero_grad() +... progress_bar.update(1) +``` + +以下のコードで確認できる通り、訓練ループに4行のコードを追加するだけで分散学習が可能です! 
+ +```diff ++ from accelerate import Accelerator + from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler + ++ accelerator = Accelerator() + + model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) + optimizer = AdamW(model.parameters(), lr=3e-5) + +- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +- model.to(device) + ++ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( ++ train_dataloader, eval_dataloader, model, optimizer ++ ) + + num_epochs = 3 + num_training_steps = num_epochs * len(train_dataloader) + lr_scheduler = get_scheduler( + "linear", + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=num_training_steps + ) + + progress_bar = tqdm(range(num_training_steps)) + + model.train() + for epoch in range(num_epochs): + for batch in train_dataloader: +- batch = {k: v.to(device) for k, v in batch.items()} + outputs = model(**batch) + loss = outputs.loss +- loss.backward() ++ accelerator.backward(loss) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) +``` + +## 訓練する + +関連するコードを追加したら、スクリプトまたは Colaboratory などのノートブックで訓練を開始します。 + +### スクリプトで訓練する + +スクリプトから訓練をしている場合は、設定ファイルを作成・保存するために以下のコマンドを実行してください: + +```bash +accelerate config +``` + +そして次のようにして訓練を開始します: + +```bash +accelerate launch train.py +``` + +### ノートブックで訓練する + +Colaboratory の TPU の利用をお考えの場合、🤗 Accelerate はノートブック上で実行することもできます。訓練に必要な全てのコードを関数に含め、[`~accelerate.notebook_launcher`] に渡してください: + +```py +>>> from accelerate import notebook_launcher + +>>> notebook_launcher(training_function) +``` + +🤗 Accelerate と豊富な機能についてもっと知りたい方は[ドキュメント](https://huggingface.co/docs/accelerate)を参照してください。 diff --git a/docs/source/ja/index.md b/docs/source/ja/index.md new file mode 100644 index 000000000000..364a3b34caba --- /dev/null +++ b/docs/source/ja/index.md @@ -0,0 +1,399 @@ + + +# 🤗 Transformers + +[PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/), [JAX](https://jax.readthedocs.io/en/latest/)のための最先端機械学習。 + +🤗 Transformers は最先端の学習済みモデルを簡単にダウンロードして学習するAPIとツールを提供します。学習済みモデルを使用することで計算コストと二酸化炭素の排出量を削減でき、またゼロからモデルを学習するために要求される時間とリソースを節約することができます。 これらのモデルは以下のような異なるモダリティにおける一般的なタスクをサポートします: + +📝 **自然言語処理**: テキスト分類、 固有表現抽出、 質問応答、 言語モデリング、 文章要約、 機械翻訳、 複数選択、テキスト生成。
+🖼️ **コンピュータビジョン**: 画像分類、 物体検出、 セグメンテーション。
+🗣️ **音声**: 自動音声認識、音声分類。
+🐙 **マルチモーダル**: テーブル質問応答、 光学文字認識(OCR)、 スキャンされたドキュメントからの情報抽出、 動画分類、 visual question answering(視覚的質問応答)。 + +🤗 Transformers はPyTorch, TensorFlow, JAX間のフレームワーク相互運用性をサポートしています。 これはモデルの各段階で異なるフレームワークを使うための柔軟性を提供します。あるフレームワークで3行のコードでモデルを学習し、別のフレームワークで推論のためにモデルをロードすることが可能です。また、本番環境のデプロイのためにモデルをONNXやTorchScriptのような形式でエクスポートすることも可能です。 + +[Hub](https://huggingface.co/models), [forum](https://discuss.huggingface.co/), [Discord](https://discord.com/invite/JfAtkvEtRb)で成長中のコミュニティに今日参加しましょう! + +## Hugging Faceチームによるカスタムサポートをご希望の場合 + + + HuggingFace Expert Acceleration Program + + +## 目次 + +ドキュメントは以下の5つのセクションで構成されています: + +- **はじめに** は、ライブラリのクイックツアーとライブラリを使い始めるためのインストール手順を提供しています。 +- **チュートリアル** は、初心者が始めるのに最適な場所です。このセクションでは、ライブラリを使い始めるために必要な基本的なスキルを習得できます。 +- **HOW-TOガイド** は、言語モデリングのために学習済みモデルをfinetuningすることやカスタムモデルの作成と共有の方法などといった特定の目標を達成するための方法を示しています。 +- **コンセプトガイド** は、モデルやタスク、そして 🤗 Transformersの設計思想の背景にある基本的にコンセプトや考え方についてより深く考察し解説しています。 +- **API** 全てのクラスと関数を説明します: + + - **MAIN CLASSES** は、configuration, model, tokenizer, pipelineといった最も重要なクラスについて詳細に説明しています。 + - **MODELS** は、ライブラリで実装されているそれぞれのモデルに関連したクラスと関数を詳細に説明しています。 + - **INTERNAL HELPERS** は、内部で使用されているユーティリティクラスや関数を詳細に説明しています。 + +### サポートされているモデル + + + +1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (Google Research and the Toyota Technological Institute at Chicago から) Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut から公開された研究論文: [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942) +1. **[AltCLIP](https://huggingface.co/docs/transformers/main/model_doc/altclip)** (BAAI から) Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell から公開された研究論文: [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) +1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (MIT から) Yuan Gong, Yu-An Chung, James Glass から公開された研究論文: [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) +1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (Facebook から) Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer から公開された研究論文: [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) +1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (École polytechnique から) Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis から公開された研究論文: [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) +1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (VinAI Research から) Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen から公開された研究論文: [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) +1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (Microsoft から) Hangbo Bao, Li Dong, Furu Wei から公開された研究論文: [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) +1. 
**[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (Google から) Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova から公開された研究論文: [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) +1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (Google から) Sascha Rothe, Shashi Narayan, Aliaksei Severyn から公開された研究論文: [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) +1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (VinAI Research から) Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen から公開された研究論文: [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) +1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (Google Research から) Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed から公開された研究論文: [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) +1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (Google Research から) Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed から公開された研究論文: [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) +1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (Microsoft Research AI4Science から) Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu から公開された研究論文: [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) +1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (Google AI から) Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil から公開された研究論文: [Big Transfer (BiT)](https://arxiv.org/abs/1912.11370)Houlsby. +1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (Facebook から) Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston から公開された研究論文: [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) +1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (Facebook から) Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston から公開された研究論文: [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) +1. **[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (Salesforce から) Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi から公開された研究論文: [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) +1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (BigScience workshop から) [BigScience Workshop](https://bigscience.huggingface.co/) から公開されました. +1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (Alexa から) Adrian de Wynter and Daniel J. 
Perry から公開された研究論文: [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) +1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (Google Research から) Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel から公開された研究論文: [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) +1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne から) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot から公開された研究論文: [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) +1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research から) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting から公開された研究論文: [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) +1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys から) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou から公開された研究論文: [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) +1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI から) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever から公開された研究論文: [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) +1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen から) Timo Lüddecke and Alexander Ecker から公開された研究論文: [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) +1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce から) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong から公開された研究論文: [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) +1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (Microsoft Research Asia から) Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang から公開された研究論文: [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) +1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (YituTech から) Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan から公開された研究論文: [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) +1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (Facebook AI から) Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie から公開された研究論文: [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) +1. **[ConvNeXTV2](model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. +1. 
**[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (Tsinghua University から) Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun から公開された研究論文: [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) +1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (Salesforce から) Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher から公開された研究論文: [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) +1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft から) Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang から公開された研究論文: [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) +1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (Facebook から) Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli から公開された研究論文: [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) +1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (Microsoft から) Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen から公開された研究論文: [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) +1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (Microsoft から) Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen から公開された研究論文: [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) +1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (Berkeley/Facebook/Google から) Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch から公開された研究論文: [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) +1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (SenseTime Research から) Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai から公開された研究論文: [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) +1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (Facebook から) Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou から公開された研究論文: [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) +1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (Facebook から) Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko から公開された研究論文: [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) +1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (Microsoft Research から) Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan から公開された研究論文: [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) +1. 
**[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (SHI Labs から) Ali Hassani and Humphrey Shi から公開された研究論文: [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) +1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (HuggingFace から), Victor Sanh, Lysandre Debut and Thomas Wolf. 同じ手法で GPT2, RoBERTa と Multilingual BERT の圧縮を行いました.圧縮されたモデルはそれぞれ [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation)、[DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation)、[DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) と名付けられました. 公開された研究論文: [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) +1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (Microsoft Research から) Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei から公開された研究論文: [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) +1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (NAVER から), Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park から公開された研究論文: [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) +1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (Facebook から) Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih から公開された研究論文: [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) +1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (Intel Labs から) René Ranftl, Alexey Bochkovskiy, Vladlen Koltun から公開された研究論文: [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) +1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Research) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan and Quoc V. Le. +1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (Google Research/Stanford University から) Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning から公開された研究論文: [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) +1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (Google Research から) Sascha Rothe, Shashi Narayan, Aliaksei Severyn から公開された研究論文: [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) +1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (Baidu から) Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu から公開された研究論文: [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) +1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (Meta AI から) はトランスフォーマープロテイン言語モデルです. **ESM-1b** は Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. 
Lawrence Zitnick, Jerry Ma, and Rob Fergus から公開された研究論文: [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118). **ESM-1v** は Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives から公開された研究論文: [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648). **ESM-2** と **ESMFold** は Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives から公開された研究論文: [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) +1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (Google AI から) Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V から公開されたレポジトリー [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) Le, and Jason Wei +1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (CNRS から) Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab から公開された研究論文: [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) +1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (Facebook AI から) Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela から公開された研究論文: [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) +1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (Google Research から) James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon から公開された研究論文: [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) +1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (CMU/Google Brain から) Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le から公開された研究論文: [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) +1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (Microsoft Research から) Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. から公開された研究論文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) +1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (KAIST から) Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim から公開された研究論文: [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) +1. 
**[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (OpenAI から) Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever から公開された研究論文: [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) +1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (EleutherAI から) Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy から公開されたレポジトリー : [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) +1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (EleutherAI から) Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach から公開された研究論文: [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) +1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (ABEJA から) Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori からリリース. +1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (OpenAI から) Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** から公開された研究論文: [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) +1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (EleutherAI から) Ben Wang and Aran Komatsuzaki から公開されたレポジトリー [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) +1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (AI-Sweden から) Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren から公開された研究論文: [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) +1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA から) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang から公開された研究論文: [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) +1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook から) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed から公開された研究論文: [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) +1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley から) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer から公開された研究論文: [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) +1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (OpenAI から) Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever から公開された研究論文: [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) +1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (OpenAI から) Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever から公開された研究論文: [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) +1. 
**[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (Microsoft Research Asia から) Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou から公開された研究論文: [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) +1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (Microsoft Research Asia から) Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou から公開された研究論文: [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) +1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (Microsoft Research Asia から) Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei から公開された研究論文: [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) +1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (Microsoft Research Asia から) Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei から公開された研究論文: [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) +1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (AllenAI から) Iz Beltagy, Matthew E. Peters, Arman Cohan から公開された研究論文: [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) +1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (Meta AI から) Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze から公開された研究論文: [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) +1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (South China University of Technology から) Jiapeng Wang, Lianwen Jin, Kai Ding から公開された研究論文: [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) +1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (AllenAI から) Iz Beltagy, Matthew E. Peters, Arman Cohan から公開された研究論文: [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) +1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (Google AI から) Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang から公開された研究論文: [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) +1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (Studio Ousia から) Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto から公開された研究論文: [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) +1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (UNC Chapel Hill から) Hao Tan and Mohit Bansal から公開された研究論文: [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) +1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (Facebook から) Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert から公開された研究論文: [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) +1. 
**[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (Facebook から) Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin から公開された研究論文: [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) +1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Jörg Tiedemann から. [OPUS](http://opus.nlpl.eu/) を使いながら学習された "Machine translation" (マシントランスレーション) モデル. [Marian Framework](https://marian-nmt.github.io/) はMicrosoft Translator Team が現在開発中です. +1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (Microsoft Research Asia から) Junlong Li, Yiheng Xu, Lei Cui, Furu Wei から公開された研究論文: [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) +1. **[Mask2Former](https://huggingface.co/docs/transformers/main/model_doc/mask2former)** (FAIR and UIUC から) Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. から公開された研究論文 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) +1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (Meta and UIUC から) Bowen Cheng, Alexander G. Schwing, Alexander Kirillov から公開された研究論文: [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) +1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook から) Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer から公開された研究論文: [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) +1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook から) Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan から公開された研究論文: [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) +1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (NVIDIA から) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro から公開された研究論文: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) +1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (NVIDIA から) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro から公開された研究論文: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) +1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (Studio Ousia から) Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka から公開された研究論文: [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) +1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (CMU/Google Brain から) Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou から公開された研究論文: [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) +1. 
**[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (Google Inc. から) Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam から公開された研究論文: [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) +1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (Google Inc. から) Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen から公開された研究論文: [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) +1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (Apple から) Sachin Mehta and Mohammad Rastegari から公開された研究論文: [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) +1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (Microsoft Research から) Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu から公開された研究論文: [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) +1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (Google AI から) Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel から公開された研究論文: [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) +1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (RUC AI Box から) Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen から公開された研究論文: [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) +1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (SHI Labs から) Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi から公開された研究論文: [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) +1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (Huawei Noah’s Ark Lab から) Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu から公開された研究論文: [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) +1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (Meta から) the NLLB team から公開された研究論文: [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) +1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (the University of Wisconsin - Madison から) Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh から公開された研究論文: [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) +1. **[OneFormer](https://huggingface.co/docs/transformers/main/model_doc/oneformer)** (SHI Labs から) Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi から公開された研究論文: [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) +1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI から) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al から公開された研究論文: [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) +1. 
**[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI から) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby から公開された研究論文: [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) +1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google から) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu から公開された研究論文: [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) +1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google から) Jason Phang, Yao Zhao, and Peter J. Liu から公開された研究論文: [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) +1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind から) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira から公開された研究論文: [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) +1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (VinAI Research から) Dat Quoc Nguyen and Anh Tuan Nguyen から公開された研究論文: [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) +1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (UCLA NLP から) Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang から公開された研究論文: [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) +1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (Sea AI Labs から) Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng から公開された研究論文: [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) +1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (Microsoft Research から) Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou から公開された研究論文: [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) +1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (NVIDIA から) Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius から公開された研究論文: [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) +1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (Facebook から) Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela から公開された研究論文: [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) +1. 
**[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (Google Research から) Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang から公開された研究論文: [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) +1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (Google Research から) Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya から公開された研究論文: [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) +1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (META Platforms から) Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár から公開された研究論文: [Designing Network Design Space](https://arxiv.org/abs/2003.13678) +1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (Google Research から) Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder から公開された研究論文: [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) +1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (Microsoft Research から) Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun から公開された研究論文: [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) +1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (Facebook から), Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov から公開された研究論文: [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) +1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/model_doc/roberta-prelayernorm)** (Facebook から) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli から公開された研究論文: [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) +1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (WeChatAI から) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou から公開された研究論文: [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) +1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (ZhuiyiTechnology から), Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu から公開された研究論文: [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) +1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA から) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo から公開された研究論文: [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) +1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP から) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi から公開された研究論文: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) +1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (ASAPP から) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi から公開された研究論文: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) +1. 
**[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (Facebook から), Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino から公開された研究論文: [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) +1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (Facebook から), Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau から公開された研究論文: [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) +1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (Tel Aviv University から), Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy から公開された研究論文: [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) +1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (Berkeley から) Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer から公開された研究論文: [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) +1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (Microsoft から) Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo から公開された研究論文: [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) +1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (Microsoft から) Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo から公開された研究論文: [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) +1. **[Swin2SR](https://huggingface.co/docs/transformers/main/model_doc/swin2sr)** (University of Würzburg から) Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte から公開された研究論文: [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) +1. **[SwitchTransformers](https://huggingface.co/docs/transformers/main/model_doc/switch_transformers)** (Google から) William Fedus, Barret Zoph, Noam Shazeer から公開された研究論文: [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) +1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (Google AI から) Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu から公開された研究論文: [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) +1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (Google AI から) Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu から公開されたレポジトリー [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) +1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (Microsoft Research から) Brandon Smock, Rohith Pesala, Robin Abraham から公開された研究論文: [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) +1. 
**[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (Google AI から) Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos から公開された研究論文: [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) +1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (Microsoft Research から) Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou から公開された研究論文: [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) +1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (HuggingFace から). +1. **[TimeSformer](https://huggingface.co/docs/transformers/main/model_doc/timesformer)** (Facebook から) Gedas Bertasius, Heng Wang, Lorenzo Torresani から公開された研究論文: [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) +1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (the University of California at Berkeley から) Michael Janner, Qiyang Li, Sergey Levine から公開された研究論文: [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) +1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU から) Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov から公開された研究論文: [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) +1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (Microsoft から), Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei から公開された研究論文: [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) +1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (Google Research から) Yi Tay, Mostafa Dehghani, Vinh Q から公開された研究論文: [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler +1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (Microsoft Research から) Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang から公開された研究論文: [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) +1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (Microsoft Research から) Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu から公開された研究論文: [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) +1. **[UPerNet](https://huggingface.co/docs/transformers/main/model_doc/upernet)** (Peking University から) Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun. から公開された研究論文 [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) +1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (Tsinghua University and Nankai University から) Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu から公開された研究論文: [Visual Attention Network](https://arxiv.org/abs/2202.09741) +1. 
**[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (Multimedia Computing Group, Nanjing University から) Zhan Tong, Yibing Song, Jue Wang, Limin Wang から公開された研究論文: [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) +1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (NAVER AI Lab/Kakao Enterprise/Kakao Brain から) Wonjae Kim, Bokyung Son, Ildoo Kim から公開された研究論文: [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) +1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (Google AI から) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby から公開された研究論文: [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) +1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (UCLA NLP から) Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang から公開された研究論文: [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) +1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (Google AI から) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby から公開された研究論文: [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) +1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (Meta AI から) Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick から公開された研究論文: [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) +1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (Meta AI から) Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas から公開された研究論文: [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) +1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (Facebook AI から) Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli から公開された研究論文: [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) +1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (Facebook AI から) Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino から公開された研究論文: [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) +1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (Facebook AI から) Qiantong Xu, Alexei Baevski, Michael Auli から公開された研究論文: [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) +1. 
**[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (Microsoft Research から) Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei から公開された研究論文: [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) +1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (OpenAI から) Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever から公開された研究論文: [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) +1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (Microsoft Research から) Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling から公開された研究論文: [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) +1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li から公開された研究論文: [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) +1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (Facebook から) Guillaume Lample and Alexis Conneau から公開された研究論文: [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) +1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (Microsoft Research から) Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou から公開された研究論文: [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) +1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (Facebook AI から), Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov から公開された研究論文: [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) +1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (Facebook AI から), Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau から公開された研究論文: [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) +1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (Google/CMU から) Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le から公開された研究論文: [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) +1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (Facebook AI から) Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli から公開された研究論文: [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) +1. 
**[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (Facebook AI から) Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli から公開された研究論文: [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) +1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (Huazhong University of Science & Technology から) Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu から公開された研究論文: [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) +1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (the University of Wisconsin - Madison から) Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh から公開された研究論文: [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) + + +### サポートされているフレームワーク + +以下のテーブルはそれぞれのモデルでサポートされているライブラリを示しています。"slow"と呼ばれるPythonトークナイザー、🤗 Tokenizers ライブラリによる"fast"トークナイザー、PyTorch, TensorFlow, Flaxの5つのそれぞれがサポートされているかを示しています。 + + + +| Model | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support | +|:-----------------------------:|:--------------:|:--------------:|:---------------:|:------------------:|:------------:| +| ALBERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| AltCLIP | ❌ | ❌ | ✅ | ❌ | ❌ | +| Audio Spectrogram Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| BART | ✅ | ✅ | ✅ | ✅ | ✅ | +| BEiT | ❌ | ❌ | ✅ | ❌ | ✅ | +| BERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ | +| BigBird | ✅ | ✅ | ✅ | ❌ | ✅ | +| BigBird-Pegasus | ❌ | ❌ | ✅ | ❌ | ❌ | +| BioGpt | ✅ | ❌ | ✅ | ❌ | ❌ | +| BiT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Blenderbot | ✅ | ✅ | ✅ | ✅ | ✅ | +| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ | +| BLIP | ❌ | ❌ | ✅ | ❌ | ❌ | +| BLOOM | ❌ | ✅ | ✅ | ❌ | ❌ | +| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | +| Chinese-CLIP | ❌ | ❌ | ✅ | ❌ | ❌ | +| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | +| CLIPSeg | ❌ | ❌ | ✅ | ❌ | ❌ | +| CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | +| Conditional DETR | ❌ | ❌ | ✅ | ❌ | ❌ | +| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| ConvNeXT | ❌ | ❌ | ✅ | ✅ | ❌ | +| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | +| CvT | ❌ | ❌ | ✅ | ✅ | ❌ | +| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecVision | ❌ | ❌ | ✅ | ✅ | ❌ | +| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | +| DeBERTa-v2 | ✅ | ✅ | ✅ | ✅ | ❌ | +| Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| Deformable DETR | ❌ | ❌ | ✅ | ❌ | ❌ | +| DeiT | ❌ | ❌ | ✅ | ✅ | ❌ | +| DETR | ❌ | ❌ | ✅ | ❌ | ❌ | +| DiNAT | ❌ | ❌ | ✅ | ❌ | ❌ | +| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| DonutSwin | ❌ | ❌ | ✅ | ❌ | ❌ | +| DPR | ✅ | ✅ | ✅ | ✅ | ❌ | +| DPT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | +| Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | +| ERNIE | ❌ | ❌ | ✅ | ❌ | ❌ | +| ESM | ✅ | ❌ | ✅ | ✅ | ❌ | +| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ | +| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ | +| FLAVA | ❌ | ❌ | ✅ | ❌ | ❌ | +| FNet | ✅ | ✅ | ✅ | ❌ | ❌ | +| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ | +| GIT | ❌ | ❌ | ✅ | ❌ | ❌ | +| GLPN | ❌ | ❌ | ✅ | ❌ | ❌ | +| GPT Neo | ❌ | ❌ | ✅ | ❌ | ✅ | +| GPT NeoX | ❌ | ✅ | ✅ | ❌ | ❌ | +| GPT NeoX Japanese | ✅ | ❌ | ✅ | ❌ | ❌ | +| GPT-J | ❌ | ❌ | ✅ | ✅ | ✅ | +| GPT-Sw3 | ✅ | ✅ | ✅ | ✅ | ✅ | +| GroupViT | ❌ | ❌ | ✅ | ✅ | ❌ | +| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ | +| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Jukebox | ✅ | ❌ | ✅ | ❌ | ❌ | +| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ | 
+| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ | +| LayoutLMv3 | ✅ | ✅ | ✅ | ✅ | ❌ | +| LED | ✅ | ✅ | ✅ | ✅ | ❌ | +| LeViT | ❌ | ❌ | ✅ | ❌ | ❌ | +| LiLT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ | +| LongT5 | ❌ | ❌ | ✅ | ❌ | ✅ | +| LUKE | ✅ | ❌ | ✅ | ❌ | ❌ | +| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| M-CTC-T | ❌ | ❌ | ✅ | ❌ | ❌ | +| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ | +| Marian | ✅ | ❌ | ✅ | ✅ | ✅ | +| MarkupLM | ✅ | ✅ | ✅ | ❌ | ❌ | +| Mask2Former | ❌ | ❌ | ✅ | ❌ | ❌ | +| MaskFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| MaskFormerSwin | ❌ | ❌ | ❌ | ❌ | ❌ | +| mBART | ✅ | ✅ | ✅ | ✅ | ✅ | +| Megatron-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | +| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| MobileNetV1 | ❌ | ❌ | ✅ | ❌ | ❌ | +| MobileNetV2 | ❌ | ❌ | ✅ | ❌ | ❌ | +| MobileViT | ❌ | ❌ | ✅ | ✅ | ❌ | +| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ | +| MT5 | ✅ | ✅ | ✅ | ✅ | ✅ | +| MVP | ✅ | ✅ | ✅ | ❌ | ❌ | +| NAT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Nezha | ❌ | ❌ | ✅ | ❌ | ❌ | +| Nyströmformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ | +| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | +| OPT | ❌ | ❌ | ✅ | ✅ | ✅ | +| OWL-ViT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ | +| PEGASUS-X | ❌ | ❌ | ✅ | ❌ | ❌ | +| Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ | +| PLBart | ✅ | ❌ | ✅ | ❌ | ❌ | +| PoolFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | +| QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ | +| RAG | ✅ | ❌ | ✅ | ✅ | ❌ | +| REALM | ✅ | ✅ | ✅ | ❌ | ❌ | +| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ | +| RegNet | ❌ | ❌ | ✅ | ✅ | ✅ | +| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| ResNet | ❌ | ❌ | ✅ | ✅ | ✅ | +| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | +| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | +| RoBERTa-PreLayerNorm | ❌ | ❌ | ✅ | ✅ | ✅ | +| RoCBert | ✅ | ❌ | ✅ | ❌ | ❌ | +| RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ | +| SegFormer | ❌ | ❌ | ✅ | ✅ | ❌ | +| SEW | ❌ | ❌ | ✅ | ❌ | ❌ | +| SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ | +| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ | +| Speech2Text | ✅ | ❌ | ✅ | ✅ | ❌ | +| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ | +| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ | +| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ | +| Swin Transformer | ❌ | ❌ | ✅ | ✅ | ❌ | +| Swin Transformer V2 | ❌ | ❌ | ✅ | ❌ | ❌ | +| Swin2SR | ❌ | ❌ | ✅ | ❌ | ❌ | +| SwitchTransformers | ❌ | ❌ | ✅ | ❌ | ❌ | +| T5 | ✅ | ✅ | ✅ | ✅ | ✅ | +| Table Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ | +| Time Series Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| TimeSformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| Trajectory Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | +| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ | +| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ | +| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ | +| UPerNet | ❌ | ❌ | ✅ | ❌ | ❌ | +| VAN | ❌ | ❌ | ✅ | ❌ | ❌ | +| VideoMAE | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViLT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | +| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ | +| VisualBERT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViT | ❌ | ❌ | ✅ | ✅ | ✅ | +| ViT Hybrid | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViTMAE | ❌ | ❌ | ✅ | ✅ | ❌ | +| ViTMSN | ❌ | ❌ | ✅ | ❌ | ❌ | +| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | +| Wav2Vec2-Conformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| WavLM | ❌ | ❌ | ✅ | ❌ | ❌ | +| Whisper | ✅ | ❌ | ✅ | ✅ | ❌ | +| X-CLIP | ❌ | ❌ | ✅ | ❌ | ❌ | +| XGLM | ✅ | ✅ | ✅ | ✅ | ✅ | +| XLM | ✅ | ❌ | ✅ | ✅ | ❌ | +| XLM-ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | +| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | +| XLM-RoBERTa-XL | ❌ | ❌ | ✅ | ❌ | ❌ | +| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ | +| YOLOS | ❌ | ❌ | ✅ | ❌ | ❌ | +| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ | + + \ No newline at end of file diff --git a/docs/source/ja/installation.md b/docs/source/ja/installation.md new file mode 100644 index 000000000000..3b8646672e52 --- /dev/null +++ 
b/docs/source/ja/installation.md @@ -0,0 +1,244 @@ + + +# インストール + +使用しているDeep Learningライブラリに対して、🤗 Transformersをインストールしてキャッシュを設定、そしてオプションでオフラインで実行できるように 🤗 Transformersを設定します。 + +🤗 TransformersはPython 3.6+, PyTorch 1.1.0+, TensorFlow 2.0+, Flaxで動作確認しています。 使用しているDeep Learningライブラリに合わせて、以下のインストール方法に従ってください: + +* [PyTorch](https://pytorch.org/get-started/locally/)のインストール手順。 +* [TensorFlow 2.0](https://www.tensorflow.org/install/pip)のインストール手順。 +* [Flax](https://flax.readthedocs.io/en/latest/)のインストール手順。 + +## pipでのインストール + +🤗 Transformersを[仮想環境](https://docs.python.org/3/library/venv.html)にインストールする必要があります。 もし、Pythonの仮想環境に馴染みがない場合は、この[ガイド](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)をご覧ください。仮想環境によって異なるプロジェクトの管理がより簡単になり、依存関係間の互換性の問題を回避できます。 + +まず、プロジェクトディレクトリに仮想環境を作成することから始めましょう: + +```bash +python -m venv .env +``` + +仮想環境を起動しましょう。LinuxとMacOsの場合は以下のコマンドで起動します: + +```bash +source .env/bin/activate +``` +Windowsで仮想環境を起動します + +```bash +.env/Scripts/activate +``` + +これで、次のコマンドで🤗 Transformersをインストールする準備が整いました: + +```bash +pip install transformers +``` + +CPU対応のみ必要な場合、🤗 TransformersとDeep Learningライブラリを1行でインストールできるようになっていて便利です。例えば、🤗 TransformersとPyTorchを以下のように一緒にインストールできます: + +```bash +pip install transformers[torch] +``` + +🤗 TransformersとTensorFlow 2.0: + +```bash +pip install transformers[tf-cpu] +``` + +🤗 TransformersとFlax: + +```bash +pip install transformers[flax] +``` + +最後に、以下のコマンドを実行することで🤗 Transformersが正しくインストールされているかを確認します。学習済みモデルがダウンロードされます: + +```bash +python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))" +``` + +その後、ラベルとスコアが出力されます: + +```bash +[{'label': 'POSITIVE', 'score': 0.9998704791069031}] +``` + +## ソースからのインストール + +以下のコマンドでソースから🤗 Transformersをインストールします: + +```bash +pip install git+https://github.com/huggingface/transformers +``` + +このコマンドは最新の安定版ではなく、開発における最新の`main`バージョンをインストールします。`main`バージョンは最新の開発状況に対応するのに便利です。例えば、最後の公式リリース以降にバグが修正されたが、新しいリリースがまだ展開されていない場合などです。しかし、これは`main`バージョンが常に安定しているとは限らないことを意味します。私たちは`main`バージョンの運用を維持するよう努め、ほとんどの問題は通常、数時間から1日以内に解決されます。もし問題に遭遇した場合は、より早く修正できるように[Issue](https://github.com/huggingface/transformers/issues)を作成してください! + +以下のコマンドを実行して、🤗 Transformersが正しくインストールされているかどうかを確認します: + +```bash +python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I love you'))" +``` + +## 編集可能なインストール + +必要に応じて、編集可能なインストールをします: + +* ソースコードの`main`バージョンを使います。 +* 🤗 Transformersにコントリビュートし、コードの変更をテストする必要があります。 + +以下のコマンドでレポジトリをクローンして、🤗 Transformersをインストールします: + +```bash +git clone https://github.com/huggingface/transformers.git +cd transformers +pip install -e . +``` + +上記のコマンドは、レポジトリをクローンしたフォルダとPythonのライブラリをパスをリンクします。Pythonは通常のライブラリパスに加えて、あなたがクローンしたフォルダの中も見るようになります。例えば、Pythonパッケージが通常、`~/anaconda3/envs/main/lib/python3.7/site-packages/`にインストールされている場合、Pythonはクローンしたフォルダも検索するようになります: `~/transformers/`. + + + +ライブラリーを使い続けたい場合は、transformersフォルダーを保持しつづける必要があります。 + + + +これで、次のコマンドで簡単にクローンを🤗 Transformersの最新版に更新できます: + +```bash +cd ~/transformers/ +git pull +``` + +Python環境は次回の実行時に🤗 Transformersの`main`バージョンを見つけるようになります。 + +## condaでのインストール + +`huggingface`のcondaチャンネルからインストールします: + +```bash +conda install -c huggingface transformers +``` + +## キャッシュの設定 + +学習済みモデルはダウンロードされ、ローカルにキャッシュされます: `~/.cache/huggingface/hub`. これはシェル環境変数`TRANSFORMERS_CACHE`で指定されるデフォルトのディレクトリです。Windowsでは、デフォルトのディレクトリは`C:\Users\username\.cache\huggingface\hub`になっています。異なるキャッシュディレクトリを指定するために、以下のシェル環境変数を変更することが可能です。優先度は以下の順番に対応します: + +1. シェル環境変数 (デフォルト): `HUGGINGFACE_HUB_CACHE` または `TRANSFORMERS_CACHE`. +2. 
シェル環境変数: `HF_HOME`. +3. シェル環境変数: `XDG_CACHE_HOME` + `/huggingface`. + + + +もし、以前のバージョンのライブラリを使用していた人で、`PYTORCH_TRANSFORMERS_CACHE`または`PYTORCH_PRETRAINED_BERT_CACHE`を設定していた場合、シェル環境変数`TRANSFORMERS_CACHE`を指定しない限り🤗 Transformersはこれらのシェル環境変数を使用します。 + + + +## オフラインモード + +🤗 Transformersはローカルファイルのみを使用することでファイアウォールやオフラインの環境でも動作させることができます。この動作を有効にするためには、環境変数`TRANSFORMERS_OFFLINE=1`を設定します。 + + + +環境変数`HF_DATASETS_OFFLINE=1`を設定し、オフライントレーニングワークフローに[🤗 Datasets](https://huggingface.co/docs/datasets/)を追加します。 + + + +例えば、外部インスタンスに対してファイアウォールで保護された通常のネットワーク上でプログラムを実行する場合、通常以下のようなコマンドで実行することになります: + +```bash +python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... +``` + +オフラインインスタンスでこの同じプログラムを実行します: + +```bash +HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \ +python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... +``` + +このスクリプトは、ローカルファイルのみを検索することが分かっているので、ハングアップしたりタイムアウトを待ったりすることなく実行されるはずです。 + +### オフラインで使用するためにモデルやトークナイザーを取得する + +オフラインで🤗 Transformersを使用するもう1つの方法は、前もってファイルをダウンロードしておき、オフラインで使用する必要があるときにそのローカルパスを指定することです。これには3つの方法があります: + +* [Model Hub](https://huggingface.co/models)のユーザーインターフェース上から↓アイコンをクリックしてファイルをダウンロードする方法。 + + ![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/download-icon.png) + +* [`PreTrainedModel.from_pretrained`]および[`PreTrainedModel.save_pretrained`]のワークフローを使用する方法: + + 1. [`PreTrainedModel.from_pretrained`]で前もってファイルをダウンロードします: + + ```py + >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM + + >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/T0_3B") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0_3B") + ``` + + 2. [`PreTrainedModel.save_pretrained`]で指定されたディレクトリにファイルを保存しておきます: + + ```py + >>> tokenizer.save_pretrained("./your/path/bigscience_t0") + >>> model.save_pretrained("./your/path/bigscience_t0") + ``` + + 3. オフラインにある時、[`PreTrainedModel.from_pretrained`]に指定したディレクトリからファイルをリロードします: + + ```py + >>> tokenizer = AutoTokenizer.from_pretrained("./your/path/bigscience_t0") + >>> model = AutoModel.from_pretrained("./your/path/bigscience_t0") + ``` + +* プログラム的に[huggingface_hub](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub)ライブラリを用いて、ファイルをダウンロードする方法: + + 1. 仮想環境に`huggingface_hub`ライブラリをインストールします: + + ```bash + python -m pip install huggingface_hub + ``` + + 2. 
指定のパスにファイルをダウンロードするために、[`hf_hub_download`](https://huggingface.co/docs/hub/adding-a-library#download-files-from-the-hub)関数を使用します。例えば、以下のコマンドで、[T0](https://huggingface.co/bigscience/T0_3B)モデルの`config.json`ファイルを指定のパスにダウンロードできます: + + ```py + >>> from huggingface_hub import hf_hub_download + + >>> hf_hub_download(repo_id="bigscience/T0_3B", filename="config.json", cache_dir="./your/path/bigscience_t0") + ``` + +ファイルがダウンロードされ、ローカルにキャッシュされたら、そのローカルパスを指定してファイルをロードして使用します: + +```py +>>> from transformers import AutoConfig + +>>> config = AutoConfig.from_pretrained("./your/path/bigscience_t0/config.json") +``` + + + +Hubに保存されているファイルをダウンロードする方法の詳細については、[How to download files from the Hub](https://huggingface.co/docs/hub/how-to-downstream)セクションを参照してください。 + + \ No newline at end of file diff --git a/docs/source/ja/multilingual.md b/docs/source/ja/multilingual.md new file mode 100644 index 000000000000..86dabb94633c --- /dev/null +++ b/docs/source/ja/multilingual.md @@ -0,0 +1,178 @@ + + +# 推論のための多言語モデル + +[[open-in-colab]] + +🤗 Transformers にはいくつかの多言語モデルがあり、それらの推論の使用方法は単一言語モデルとは異なります。ただし、多言語モデルの使用方法がすべて異なるわけではありません。 [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased) などの一部のモデルは、単一言語モデルと同様に使用できます。 このガイドでは、推論のために使用方法が異なる多言語モデルをどのように使うかを示します。 + +## XLM + +XLM には10の異なるチェックポイントがあり、そのうちの1つだけが単一言語です。 残りの9つのモデルチェックポイントは、言語埋め込みを使用するチェックポイントと使用しないチェックポイントの2つのカテゴリに分けることができます。 + +### 言語の埋め込みがある XLM + +次の XLM モデルは、言語の埋め込みを使用して、推論で使用される言語を指定します。 + +- `xlm-mlm-ende-1024` (マスク化された言語モデリング、英語-ドイツ語) +- `xlm-mlm-enfr-1024` (マスク化された言語モデリング、英語-フランス語) +- `xlm-mlm-enro-1024` (マスク化された言語モデリング、英語-ルーマニア語) +- `xlm-mlm-xnli15-1024` (マスク化された言語モデリング、XNLI 言語) +- `xlm-mlm-tlm-xnli15-1024` (マスク化された言語モデリング + 翻訳 + XNLI 言語) +- `xlm-clm-enfr-1024` (因果言語モデリング、英語-フランス語) +- `xlm-clm-ende-1024` (因果言語モデリング、英語-ドイツ語) + +言語の埋め込みは、モデルに渡される `input_ids` と同じ形状のテンソルとして表されます。 これらのテンソルの値は、使用される言語に依存し、トークナイザーの `lang2id` および `id2lang` 属性によって識別されます。 + +この例では、`xlm-clm-enfr-1024` チェックポイントをロードします (因果言語モデリング、英語-フランス語)。 + +```py +>>> import torch +>>> from transformers import XLMTokenizer, XLMWithLMHeadModel + +>>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024") +>>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024") +``` + +トークナイザーの `lang2id` 属性は、このモデルの言語とその ID を表示します。 + +```py +>>> print(tokenizer.lang2id) +{'en': 0, 'fr': 1} +``` + +次に、入力例を作成します。 + +```py +>>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1 +``` + +言語 ID を `en` に設定し、それを使用して言語の埋め込みを定義します。 言語の埋め込みは、英語の言語 ID であるため、`0` で埋められたテンソルです。 このテンソルは `input_ids` と同じサイズにする必要があります。 + +```py +>>> language_id = tokenizer.lang2id["en"] # 0 +>>> langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, ..., 0]) + +>>> # We reshape it to be of size (batch_size, sequence_length) +>>> langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1) +``` + +これで、`input_ids` と言語の埋め込みをモデルに渡すことができます。 + +```py +>>> outputs = model(input_ids, langs=langs) +``` + +[run_generation.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-generation/run_generation.py) スクリプトは、`xlm-clm` チェックポイントを使用して、言語が埋め込まれたテキストを生成できます。 + +### 言語の埋め込みがないXLM + +次の XLM モデルは、推論中に言語の埋め込みを必要としません。 + +- `xlm-mlm-17-1280` (マスク化された言語モデリング、17の言語) +- `xlm-mlm-100-1280` (マスク化された言語モデリング、100の言語) + +これらのモデルは、以前の XLM チェックポイントとは異なり、一般的な文の表現に使用されます。 + +## BERT + +以下の BERT モデルは、多言語タスクに使用できます。 + +- `bert-base-multilingual-uncased` (マスク化された言語モデリング + 次の文の予測、102の言語) +- 
`bert-base-multilingual-cased` (マスク化された言語モデリング + 次の文の予測、104の言語) + +これらのモデルは、推論中に言語の埋め込みを必要としません。 文脈から言語を識別し、それに応じて推測する必要があります。 + +## XLM-RoBERTa + +次の XLM-RoBERTa モデルは、多言語タスクに使用できます。 + +- `xlm-roberta-base` (マスク化された言語モデリング、100の言語) +- `xlm-roberta-large` (マスク化された言語モデリング、100の言語) + +XLM-RoBERTa は、100の言語で新しく作成およびクリーニングされた2.5 TB の CommonCrawl データでトレーニングされました。 これは、分類、シーケンスのラベル付け、質問応答などのダウンストリームタスクで、mBERT や XLM などの以前にリリースされた多言語モデルを大幅に改善します。 + +## M2M100 + +次の M2M100 モデルは、多言語翻訳に使用できます。 + +- `facebook/m2m100_418M` (翻訳) +- `facebook/m2m100_1.2B` (翻訳) + +この例では、`facebook/m2m100_418M` チェックポイントをロードして、中国語から英語に翻訳します。 トークナイザーでソース言語を設定できます。 + +```py +>>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer + +>>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." +>>> chinese_text = "不要插手巫師的事務, 因為他們是微妙的, 很快就會發怒." + +>>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="zh") +>>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") +``` + +テキストをトークン化します。 + +```py +>>> encoded_zh = tokenizer(chinese_text, return_tensors="pt") +``` + +M2M100 は、ターゲット言語に翻訳するために、最初に生成されるトークンとしてターゲット言語 ID を強制します。 英語に翻訳するには、`generate` メソッドで `forced_bos_token_id` を `en` に設定します。 + +```py +>>> generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en")) +>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +'Do not interfere with the matters of the witches, because they are delicate and will soon be angry.' +``` + +## MBart + +多言語翻訳には、次の MBart モデルを使用できます。 + +- `facebook/mbart-large-50-one-to-many-mmt` (One-to-many multilingual machine translation, 50 languages) +- `facebook/mbart-large-50-many-to-many-mmt` (Many-to-many multilingual machine translation, 50 languages) +- `facebook/mbart-large-50-many-to-one-mmt` (Many-to-one multilingual machine translation, 50 languages) +- `facebook/mbart-large-50` (Multilingual translation, 50 languages) +- `facebook/mbart-large-cc25` + +この例では、`facebook/mbart-large-50-many-to-many-mmt` チェックポイントをロードして、フィンランド語を英語に翻訳します。トークナイザーでソース言語を設定できます。 + +```py +>>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM + +>>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." +>>> fi_text = "Älä sekaannu velhojen asioihin, sillä ne ovat hienovaraisia ja nopeasti vihaisia." + +>>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", src_lang="fi_FI") +>>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") +``` + +フィンランド語のテキストをトークン化します。 + +```py +>>> encoded_fi = tokenizer(fi_text, return_tensors="pt") +``` + +MBart は、ターゲット言語に翻訳するために、最初に生成されるトークンとしてターゲット言語 ID を強制します。 英語に翻訳するには、`generate` メソッドで `forced_bos_token_id` を `en_XX` に設定します。 + +```py +>>> generated_tokens = model.generate(**encoded_fi, forced_bos_token_id=tokenizer.lang_code_to_id("en_XX")) +>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +"Don't interfere with the wizard's affairs, because they are subtle, will soon get angry."
+``` + +`facebook/mbart-large-50-many-to-one-mmt` チェックポイントを使用している場合、最初に生成されたトークンとしてターゲット言語 ID を強制する必要はありません。それ以外の場合、使用方法は同じです。 \ No newline at end of file diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index 62c6e57c72dd..b26d85aeae92 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -1,58 +1,692 @@ - sections: - local: index title: 🤗 Transformers + - local: quicktour + title: 둘러보기 + - local: installation + title: 설치방법 title: 시작하기 - sections: - - local: in_translation - title: (번역 중) + - local: pipeline_tutorial + title: Pipeline으로 추론하기 + - local: autoclass_tutorial + title: AutoClass로 사전 학습된 인스턴스 로드하기 + - local: preprocessing + title: 데이터 전처리하기 + - local: training + title: 사전 학습된 모델 미세 조정하기 + - local: run_scripts + title: 스크립트로 학습하기 + - local: accelerate + title: 🤗 Accelerate로 분산 학습 구성하기 + - local: peft + title: 🤗 PEFT로 어댑터 로드 및 학습하기 + - local: model_sharing + title: 만든 모델 공유하기 + - local: transformers_agents + title: 에이전트 + - local: llm_tutorial + title: 대규모 언어 모델로 생성하기 title: 튜토리얼 - sections: - - local: in_translation - title: (번역 중) - title: How-to 가이드 + - sections: + - local: tasks/sequence_classification + title: 텍스트 분류 + - local: tasks/token_classification + title: 토큰 분류 + - local: tasks/question_answering + title: 질의 응답(Question Answering) + - local: tasks/language_modeling + title: 인과적 언어 모델링(Causal language modeling) + - local: tasks/masked_language_modeling + title: 마스킹된 언어 모델링(Masked language modeling) + - local: tasks/translation + title: 번역 + - local: tasks/summarization + title: 요약 + - local: tasks/multiple_choice + title: 객관식 문제(Multiple Choice) + title: 자연어처리 + isExpanded: false + - sections: + - local: tasks/audio_classification + title: 오디오 분류 + - local: tasks/asr + title: 자동 음성 인식 + title: 오디오 + isExpanded: false + - sections: + - local: tasks/image_classification + title: 이미지 분류 + - local: tasks/semantic_segmentation + title: 의미적 분할(Semantic segmentation) + - local: tasks/video_classification + title: 영상 분류 + - local: tasks/object_detection + title: 객체 탐지 + - local: tasks/zero_shot_object_detection + title: 제로샷(zero-shot) 객체 탐지 + - local: tasks/zero_shot_image_classification + title: 제로샷(zero-shot) 이미지 분류 + - local: tasks/monocular_depth_estimation + title: 단일 영상 기반 깊이 추정 + title: 컴퓨터 비전 + isExpanded: false + - sections: + - local: tasks/image_captioning + title: 이미지 캡셔닝 + - local: tasks/document_question_answering + title: 문서 질의 응답(Document Question Answering) + - local: tasks/visual_question_answering + title: 시각적 질의응답 (Visual Question Answering) + title: 멀티모달 + isExpanded: false + title: 태스크 가이드 +- sections: + - local: fast_tokenizers + title: 🤗 Tokenizers 라이브러리에서 토크나이저 사용하기 + - local: multilingual + title: 다국어 모델 추론하기 + - local: in_translation + title: (번역중) Customize text generation strategy + - local: create_a_model + title: 모델별 API 사용하기 + - local: custom_models + title: 사용자 정의 모델 공유하기 + - local: sagemaker + title: Amazon SageMaker에서 학습 실행하기 + - local: serialization + title: ONNX로 내보내기 + - local: tflite + title: TFLite로 내보내기 + - local: torchscript + title: TorchScript로 내보내기 + - local: in_translation + title: (번역중) Benchmarks + - local: in_translation + title: (번역중) Notebooks with examples + - local: community + title: 커뮤니티 리소스 + - local: custom_tools + title: 사용자 정의 도구와 프롬프트 + - local: troubleshooting + title: 문제 해결 + title: (번역중) 개발자 가이드 +- sections: + - local: performance + title: 성능 및 확장성 + - local: in_translation + title: (번역중) Training on one GPU + - local: perf_train_gpu_many + title: 다중 GPU에서 훈련 진행하기 + - 
local: perf_train_cpu + title: CPU에서 훈련 + - local: perf_train_cpu_many + title: 다중 CPU에서 훈련하기 + - local: in_translation + title: (번역중) Training on TPUs + - local: perf_train_tpu_tf + title: TensorFlow로 TPU에서 훈련하기 + - local: in_translation + title: (번역중) Training on Specialized Hardware + - local: perf_infer_cpu + title: CPU로 추론하기 + - local: perf_infer_gpu_one + title: 하나의 GPU를 활용한 추론 + - local: perf_infer_gpu_many + title: 다중 GPU에서 추론 + - local: in_translation + title: (번역중) Inference on Specialized Hardware + - local: perf_hardware + title: 훈련용 사용자 맞춤형 하드웨어 + - local: in_translation + title: (번역중) Instantiating a big model + - local: debugging + title: 디버깅 + - local: hpo_train + title: Trainer API를 사용한 하이퍼파라미터 탐색 + - local: tf_xla + title: TensorFlow 모델을 위한 XLA 통합 + title: (번역중) 성능 및 확장성 - sections: + - local: contributing + title: 🤗 Transformers에 기여하는 방법 + - local: add_new_model + title: 🤗 Transformers에 새로운 모델을 추가하는 방법 + - local: add_tensorflow_model + title: 어떻게 🤗 Transformers 모델을 TensorFlow로 변환하나요? + - local: add_new_pipeline + title: 어떻게 🤗 Transformers에 파이프라인을 추가하나요? + - local: testing + title: 테스트 + - local: pr_checks + title: Pull Request에 대한 검사 + title: (번역중) 기여하기 + +- sections: + - local: philosophy + title: 이념과 목표 - local: in_translation - title: (번역 중) - title: 개념 가이드 + title: (번역중) Glossary + - local: task_summary + title: 🤗 Transformers로 할 수 있는 작업 + - local: tasks_explained + title: 🤗 Transformers로 작업을 해결하는 방법 + - local: model_summary + title: Transformer 모델군 + - local: tokenizer_summary + title: 토크나이저 요약 + - local: attention + title: 어텐션 매커니즘 + - local: pad_truncation + title: 패딩과 잘라내기 + - local: bertology + title: BERTology + - local: perplexity + title: 고정 길이 모델의 펄플렉서티(Perplexity) + - local: pipeline_webserver + title: 추론 웹 서버를 위한 파이프라인 + - local: model_memory_anatomy + title: 모델 학습 해부하기 + title: (번역중) 개념 가이드 - sections: - sections: - local: in_translation - title: (번역 중) - title: 메인 클래스 + title: (번역중) Auto Classes + - local: in_translation + title: (번역중) Callbacks + - local: in_translation + title: (번역중) Configuration + - local: in_translation + title: (번역중) Data Collator + - local: in_translation + title: (번역중) Keras callbacks + - local: in_translation + title: (번역중) Logging + - local: in_translation + title: (번역중) Models + - local: in_translation + title: (번역중) Text Generation + - local: in_translation + title: (번역중) ONNX + - local: in_translation + title: (번역중) Optimization + - local: in_translation + title: (번역중) Model outputs + - local: in_translation + title: (번역중) Pipelines + - local: in_translation + title: (번역중) Processors + - local: in_translation + title: (번역중) Quantization + - local: in_translation + title: (번역중) Tokenizer + - local: in_translation + title: (번역중) Trainer + - local: in_translation + title: (번역중) DeepSpeed Integration + - local: in_translation + title: (번역중) Feature Extractor + - local: in_translation + title: (번역중) Image Processor + title: (번역중) 메인 클래스 - sections: - isExpanded: false sections: - local: in_translation - title: (번역 중) - title: 텍스트 모델 + title: (번역중) ALBERT + - local: in_translation + title: (번역중) BART + - local: in_translation + title: (번역중) BARThez + - local: in_translation + title: (번역중) BARTpho + - local: in_translation + title: (번역중) BERT + - local: in_translation + title: (번역중) BertGeneration + - local: in_translation + title: (번역중) BertJapanese + - local: in_translation + title: (번역중) Bertweet + - local: in_translation + title: (번역중) BigBird + - local: in_translation + title: (번역중) BigBirdPegasus + - local: in_translation + 
title: (번역중) BioGpt + - local: in_translation + title: (번역중) Blenderbot + - local: in_translation + title: (번역중) Blenderbot Small + - local: in_translation + title: (번역중) BLOOM + - local: in_translation + title: (번역중) BORT + - local: in_translation + title: (번역중) ByT5 + - local: in_translation + title: (번역중) CamemBERT + - local: in_translation + title: (번역중) CANINE + - local: in_translation + title: (번역중) CodeGen + - local: in_translation + title: (번역중) ConvBERT + - local: in_translation + title: (번역중) CPM + - local: in_translation + title: (번역중) CPMANT + - local: in_translation + title: (번역중) CTRL + - local: in_translation + title: (번역중) DeBERTa + - local: in_translation + title: (번역중) DeBERTa-v2 + - local: in_translation + title: (번역중) DialoGPT + - local: in_translation + title: (번역중) DistilBERT + - local: in_translation + title: (번역중) DPR + - local: in_translation + title: (번역중) ELECTRA + - local: in_translation + title: (번역중) Encoder Decoder Models + - local: in_translation + title: (번역중) ERNIE + - local: in_translation + title: (번역중) ErnieM + - local: in_translation + title: (번역중) ESM + - local: in_translation + title: (번역중) FLAN-T5 + - local: in_translation + title: (번역중) FLAN-UL2 + - local: in_translation + title: (번역중) FlauBERT + - local: in_translation + title: (번역중) FNet + - local: in_translation + title: (번역중) FSMT + - local: in_translation + title: (번역중) Funnel Transformer + - local: in_translation + title: (번역중) GPT + - local: in_translation + title: (번역중) GPT Neo + - local: in_translation + title: (번역중) GPT NeoX + - local: in_translation + title: (번역중) GPT NeoX Japanese + - local: in_translation + title: (번역중) GPT-J + - local: in_translation + title: (번역중) GPT2 + - local: in_translation + title: (번역중) GPTBigCode + - local: in_translation + title: (번역중) GPTSAN Japanese + - local: in_translation + title: (번역중) GPTSw3 + - local: in_translation + title: (번역중) HerBERT + - local: in_translation + title: (번역중) I-BERT + - local: in_translation + title: (번역중) Jukebox + - local: in_translation + title: (번역중) LED + - local: model_doc/llama + title: LLaMA + - local: model_doc/llama2 + title: LLaMA2 + - local: in_translation + title: (번역중) Longformer + - local: in_translation + title: (번역중) LongT5 + - local: in_translation + title: (번역중) LUKE + - local: in_translation + title: (번역중) M2M100 + - local: in_translation + title: (번역중) MarianMT + - local: in_translation + title: (번역중) MarkupLM + - local: in_translation + title: (번역중) MBart and MBart-50 + - local: in_translation + title: (번역중) MEGA + - local: in_translation + title: (번역중) MegatronBERT + - local: in_translation + title: (번역중) MegatronGPT2 + - local: in_translation + title: (번역중) mLUKE + - local: in_translation + title: (번역중) MobileBERT + - local: in_translation + title: (번역중) MPNet + - local: in_translation + title: (번역중) MT5 + - local: in_translation + title: (번역중) MVP + - local: in_translation + title: (번역중) NEZHA + - local: in_translation + title: (번역중) NLLB + - local: in_translation + title: (번역중) NLLB-MoE + - local: in_translation + title: (번역중) Nyströmformer + - local: in_translation + title: (번역중) Open-Llama + - local: in_translation + title: (번역중) OPT + - local: in_translation + title: (번역중) Pegasus + - local: in_translation + title: (번역중) PEGASUS-X + - local: in_translation + title: (번역중) PhoBERT + - local: in_translation + title: (번역중) PLBart + - local: in_translation + title: (번역중) ProphetNet + - local: in_translation + title: (번역중) QDQBert + - local: in_translation + title: (번역중) RAG + - local: in_translation + title: 
(번역중) REALM + - local: in_translation + title: (번역중) Reformer + - local: in_translation + title: (번역중) RemBERT + - local: in_translation + title: (번역중) RetriBERT + - local: in_translation + title: (번역중) RoBERTa + - local: in_translation + title: (번역중) RoBERTa-PreLayerNorm + - local: in_translation + title: (번역중) RoCBert + - local: in_translation + title: (번역중) RoFormer + - local: in_translation + title: (번역중) Splinter + - local: in_translation + title: (번역중) SqueezeBERT + - local: in_translation + title: (번역중) SwitchTransformers + - local: in_translation + title: (번역중) T5 + - local: in_translation + title: (번역중) T5v1.1 + - local: in_translation + title: (번역중) TAPEX + - local: in_translation + title: (번역중) Transformer XL + - local: in_translation + title: (번역중) UL2 + - local: in_translation + title: (번역중) X-MOD + - local: in_translation + title: (번역중) XGLM + - local: in_translation + title: (번역중) XLM + - local: in_translation + title: (번역중) XLM-ProphetNet + - local: in_translation + title: (번역중) XLM-RoBERTa + - local: in_translation + title: (번역중) XLM-RoBERTa-XL + - local: in_translation + title: (번역중) XLM-V + - local: in_translation + title: (번역중) XLNet + - local: in_translation + title: (번역중) YOSO + title: (번역중) 텍스트 모델 + - isExpanded: false + sections: + - local: in_translation + title: (번역중) BEiT + - local: in_translation + title: (번역중) BiT + - local: in_translation + title: (번역중) Conditional DETR + - local: in_translation + title: (번역중) ConvNeXT + - local: in_translation + title: (번역중) ConvNeXTV2 + - local: in_translation + title: (번역중) CvT + - local: in_translation + title: (번역중) Deformable DETR + - local: in_translation + title: (번역중) DeiT + - local: in_translation + title: (번역중) DETA + - local: in_translation + title: (번역중) DETR + - local: in_translation + title: (번역중) DiNAT + - local: in_translation + title: (번역중) DiT + - local: in_translation + title: (번역중) DPT + - local: in_translation + title: (번역중) EfficientFormer + - local: in_translation + title: (번역중) EfficientNet + - local: in_translation + title: (번역중) FocalNet + - local: in_translation + title: (번역중) GLPN + - local: in_translation + title: (번역중) ImageGPT + - local: in_translation + title: (번역중) LeViT + - local: in_translation + title: (번역중) Mask2Former + - local: in_translation + title: (번역중) MaskFormer + - local: in_translation + title: (번역중) MobileNetV1 + - local: in_translation + title: (번역중) MobileNetV2 + - local: in_translation + title: (번역중) MobileViT + - local: in_translation + title: (번역중) NAT + - local: in_translation + title: (번역중) PoolFormer + - local: in_translation + title: (번역중) RegNet + - local: in_translation + title: (번역중) ResNet + - local: in_translation + title: (번역중) SegFormer + - local: in_translation + title: (번역중) Swin Transformer + - local: in_translation + title: (번역중) Swin Transformer V2 + - local: in_translation + title: (번역중) Swin2SR + - local: in_translation + title: (번역중) Table Transformer + - local: in_translation + title: (번역중) TimeSformer + - local: in_translation + title: (번역중) UperNet + - local: in_translation + title: (번역중) VAN + - local: in_translation + title: (번역중) VideoMAE + - local: in_translation + title: (번역중) Vision Transformer (ViT) + - local: in_translation + title: (번역중) ViT Hybrid + - local: in_translation + title: (번역중) ViTMAE + - local: in_translation + title: (번역중) ViTMSN + - local: in_translation + title: (번역중) YOLOS + title: (번역중) 비전 모델 - isExpanded: false sections: - local: in_translation - title: (번역 중) - title: 비전 모델 + title: (번역중) Audio Spectrogram Transformer + - 
local: in_translation + title: (번역중) CLAP + - local: in_translation + title: (번역중) Hubert + - local: in_translation + title: (번역중) MCTCT + - local: in_translation + title: (번역중) SEW + - local: in_translation + title: (번역중) SEW-D + - local: in_translation + title: (번역중) Speech2Text + - local: in_translation + title: (번역중) Speech2Text2 + - local: in_translation + title: (번역중) SpeechT5 + - local: in_translation + title: (번역중) UniSpeech + - local: in_translation + title: (번역중) UniSpeech-SAT + - local: in_translation + title: (번역중) Wav2Vec2 + - local: in_translation + title: (번역중) Wav2Vec2-Conformer + - local: in_translation + title: (번역중) Wav2Vec2Phoneme + - local: in_translation + title: (번역중) WavLM + - local: model_doc/whisper + title: Whisper + - local: in_translation + title: (번역중) XLS-R + - local: in_translation + title: (번역중) XLSR-Wav2Vec2 + title: (번역중) 오디오 모델 - isExpanded: false sections: - local: in_translation - title: (번역 중) - title: 오디오 모델 + title: (번역중) ALIGN + - local: in_translation + title: (번역중) AltCLIP + - local: in_translation + title: (번역중) BLIP + - local: in_translation + title: (번역중) BLIP-2 + - local: in_translation + title: (번역중) BridgeTower + - local: in_translation + title: (번역중) Chinese-CLIP + - local: in_translation + title: (번역중) CLIP + - local: in_translation + title: (번역중) CLIPSeg + - local: in_translation + title: (번역중) Data2Vec + - local: in_translation + title: (번역중) DePlot + - local: in_translation + title: (번역중) Donut + - local: in_translation + title: (번역중) FLAVA + - local: in_translation + title: (번역중) GIT + - local: in_translation + title: (번역중) GroupViT + - local: in_translation + title: (번역중) LayoutLM + - local: in_translation + title: (번역중) LayoutLMV2 + - local: in_translation + title: (번역중) LayoutLMV3 + - local: in_translation + title: (번역중) LayoutXLM + - local: in_translation + title: (번역중) LiLT + - local: in_translation + title: (번역중) LXMERT + - local: in_translation + title: (번역중) MatCha + - local: in_translation + title: (번역중) MGP-STR + - local: in_translation + title: (번역중) OneFormer + - local: in_translation + title: (번역중) OWL-ViT + - local: in_translation + title: (번역중) Perceiver + - local: in_translation + title: (번역중) Pix2Struct + - local: in_translation + title: (번역중) Segment Anything + - local: in_translation + title: (번역중) Speech Encoder Decoder Models + - local: in_translation + title: (번역중) TAPAS + - local: in_translation + title: (번역중) TrOCR + - local: in_translation + title: (번역중) TVLT + - local: in_translation + title: (번역중) ViLT + - local: in_translation + title: (번역중) Vision Encoder Decoder Models + - local: in_translation + title: (번역중) Vision Text Dual Encoder + - local: in_translation + title: (번역중) VisualBERT + - local: in_translation + title: (번역중) X-CLIP + title: (번역중) 멀티모달 모델 - isExpanded: false sections: - local: in_translation - title: (번역 중) - title: 멀티모달 모델 + title: (번역중) Decision Transformer + - local: in_translation + title: (번역중) Trajectory Transformer + title: (번역중) 강화학습 모델 - isExpanded: false sections: - local: in_translation - title: (번역 중) - title: 강화학습 모델 + title: (번역중) Informer + - local: in_translation + title: (번역중) Time Series Transformer + title: (번역중) 시계열 모델 - isExpanded: false sections: - local: in_translation - title: (번역 중) - title: 시계열 모델 - title: 모델 + title: (번역중) Graphormer + title: (번역중) Graph models + title: (번역중) 모델 - sections: - local: in_translation - title: (번역 중) - title: 내부 유틸리티 - title: API + title: (번역중) Custom Layers and Utilities + - local: in_translation + title: (번역중) Utilities for 
pipelines + - local: in_translation + title: (번역중) Utilities for Tokenizers + - local: in_translation + title: (번역중) Utilities for Trainer + - local: in_translation + title: (번역중) Utilities for Generation + - local: in_translation + title: (번역중) Utilities for Image Processors + - local: in_translation + title: (번역중) Utilities for Audio processing + - local: in_translation + title: (번역중) General Utilities + - local: in_translation + title: (번역중) Utilities for Time Series + title: (번역중) Internal Helpers + title: (번역중) API diff --git a/docs/source/ko/accelerate.md b/docs/source/ko/accelerate.md new file mode 100644 index 000000000000..0ef8957de3ac --- /dev/null +++ b/docs/source/ko/accelerate.md @@ -0,0 +1,136 @@ + + +# 🤗 Accelerate를 활용한 분산 학습[[distributed-training-with-accelerate]] + +모델이 커지면서 병렬 처리는 제한된 하드웨어에서 더 큰 모델을 훈련하고 훈련 속도를 몇 배로 가속화하기 위한 전략으로 등장했습니다. Hugging Face에서는 사용자가 하나의 머신에 여러 개의 GPU를 사용하든 여러 머신에 여러 개의 GPU를 사용하든 모든 유형의 분산 설정에서 🤗 Transformers 모델을 쉽게 훈련할 수 있도록 돕기 위해 [🤗 Accelerate](https://huggingface.co/docs/accelerate) 라이브러리를 만들었습니다. 이 튜토리얼에서는 분산 환경에서 훈련할 수 있도록 기본 PyTorch 훈련 루프를 커스터마이즈하는 방법을 알아봅시다. + +## 설정[[setup]] + +🤗 Accelerate 설치 시작하기: + +```bash +pip install accelerate +``` + +그 다음, [`~accelerate.Accelerator`] 객체를 불러오고 생성합니다. [`~accelerate.Accelerator`]는 자동으로 분산 설정 유형을 감지하고 훈련에 필요한 모든 구성 요소를 초기화합니다. 장치에 모델을 명시적으로 배치할 필요는 없습니다. + +```py +>>> from accelerate import Accelerator + +>>> accelerator = Accelerator() +``` + +## 가속화를 위한 준비[[prepare-to-accelerate]] + +다음 단계는 관련된 모든 훈련 객체를 [`~accelerate.Accelerator.prepare`] 메소드에 전달하는 것입니다. 여기에는 훈련 및 평가 데이터로더, 모델 및 옵티마이저가 포함됩니다: + +```py +>>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( +... train_dataloader, eval_dataloader, model, optimizer +... ) +``` + +## 백워드(Backward)[[backward]] + +마지막으로 훈련 루프의 일반적인 `loss.backward()`를 🤗 Accelerate의 [`~accelerate.Accelerator.backward`] 메소드로 대체하기만 하면 됩니다: + +```py +>>> for epoch in range(num_epochs): +... for batch in train_dataloader: +... outputs = model(**batch) +... loss = outputs.loss +... accelerator.backward(loss) + +... optimizer.step() +... lr_scheduler.step() +... optimizer.zero_grad() +... progress_bar.update(1) +``` + +다음 코드에서 볼 수 있듯이, 훈련 루프에 코드 네 줄만 추가하면 분산 학습을 활성화할 수 있습니다! + +```diff ++ from accelerate import Accelerator + from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler + ++ accelerator = Accelerator() + + model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) + optimizer = AdamW(model.parameters(), lr=3e-5) + +- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +- model.to(device) + ++ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( ++ train_dataloader, eval_dataloader, model, optimizer ++ ) + + num_epochs = 3 + num_training_steps = num_epochs * len(train_dataloader) + lr_scheduler = get_scheduler( + "linear", + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=num_training_steps + ) + + progress_bar = tqdm(range(num_training_steps)) + + model.train() + for epoch in range(num_epochs): + for batch in train_dataloader: +- batch = {k: v.to(device) for k, v in batch.items()} + outputs = model(**batch) + loss = outputs.loss +- loss.backward() ++ accelerator.backward(loss) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) +``` + +## 학습[[train]] + +관련 코드를 추가한 후에는 스크립트나 Colaboratory와 같은 노트북에서 훈련을 시작하세요. 
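+
+참고로, 위에서 나누어 설명한 변경 사항을 하나의 스크립트로 모으면 대략 다음과 같은 형태가 됩니다. 아래 스케치에서 사용한 `bert-base-uncased` 체크포인트, GLUE MRPC 데이터셋, 배치 크기와 학습률은 설명을 위한 가정일 뿐이므로, 실제로는 자신의 모델과 데이터에 맞게 바꿔서 사용하세요.
+
+```py
+from accelerate import Accelerator
+from datasets import load_dataset
+from torch.optim import AdamW
+from torch.utils.data import DataLoader
+from tqdm.auto import tqdm
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_scheduler
+
+# 예시용 체크포인트와 데이터셋 (설명을 위한 가정)
+checkpoint = "bert-base-uncased"
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
+
+dataset = load_dataset("glue", "mrpc")
+
+
+def tokenize_function(batch):
+    return tokenizer(batch["sentence1"], batch["sentence2"], truncation=True, padding="max_length", max_length=128)
+
+
+# 데이터셋을 토큰화하고 PyTorch 텐서 형식으로 변환합니다.
+dataset = dataset.map(tokenize_function, batched=True)
+dataset = dataset.remove_columns(["sentence1", "sentence2", "idx"])
+dataset = dataset.rename_column("label", "labels")
+dataset.set_format("torch")
+
+train_dataloader = DataLoader(dataset["train"], shuffle=True, batch_size=8)
+eval_dataloader = DataLoader(dataset["validation"], batch_size=8)
+
+optimizer = AdamW(model.parameters(), lr=3e-5)
+
+# Accelerator가 분산 설정을 감지하고 훈련 객체들을 준비합니다.
+accelerator = Accelerator()
+train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
+    train_dataloader, eval_dataloader, model, optimizer
+)
+
+num_epochs = 3
+num_training_steps = num_epochs * len(train_dataloader)
+lr_scheduler = get_scheduler(
+    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
+)
+
+progress_bar = tqdm(range(num_training_steps))
+
+model.train()
+for epoch in range(num_epochs):
+    for batch in train_dataloader:
+        outputs = model(**batch)
+        loss = outputs.loss
+        accelerator.backward(loss)
+
+        optimizer.step()
+        lr_scheduler.step()
+        optimizer.zero_grad()
+        progress_bar.update(1)
+```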
+ +### 스크립트로 학습하기[[train-with-a-script]] + +스크립트에서 훈련을 실행하는 경우, 다음 명령을 실행하여 구성 파일을 생성하고 저장합니다: + +```bash +accelerate config +``` + +Then launch your training with: + +```bash +accelerate launch train.py +``` + +### 노트북으로 학습하기[[train-with-a-notebook]] + +Collaboratory의 TPU를 사용하려는 경우, 노트북에서도 🤗 Accelerate를 실행할 수 있습니다. 훈련을 담당하는 모든 코드를 함수로 감싸서 [`~accelerate.notebook_launcher`]에 전달하세요: + +```py +>>> from accelerate import notebook_launcher + +>>> notebook_launcher(training_function) +``` + +🤗 Accelerate 및 다양한 기능에 대한 자세한 내용은 [documentation](https://huggingface.co/docs/accelerate)를 참조하세요. \ No newline at end of file diff --git a/docs/source/ko/add_new_model.md b/docs/source/ko/add_new_model.md new file mode 100644 index 000000000000..6ae32d2ac60f --- /dev/null +++ b/docs/source/ko/add_new_model.md @@ -0,0 +1,630 @@ + + +# Hugging Face Transformers를 추가하는 방법은 무엇인가요? [[how-to-add-a-model-to-transformers]] + +Hugging Face Transformers 라이브러리는 커뮤니티 기여자들 덕분에 새로운 모델을 제공할 수 있는 경우가 많습니다. 하지만 이는 도전적인 프로젝트이며 Hugging Face Transformers 라이브러리와 구현할 모델에 대한 깊은 이해가 필요합니다. Hugging Face에서는 더 많은 커뮤니티 멤버가 모델을 적극적으로 추가할 수 있도록 지원하고자 하며, 이 가이드를 통해 PyTorch 모델을 추가하는 과정을 안내하고 있습니다 (PyTorch가 설치되어 있는지 확인해주세요). + + + +TensorFlow 모델을 구현하고자 하는 경우 [🤗 Transformers 모델을 TensorFlow로 변환하는 방법](add_tensorflow_model) 가이드를 살펴보세요! + + + +이 과정을 진행하면 다음과 같은 내용을 이해하게 됩니다: + +- 오픈 소스의 모범 사례에 대한 통찰력을 얻습니다. +- 가장 인기 있는 딥러닝 라이브러리의 설계 원칙을 이해합니다. +- 대규모 모델을 효율적으로 테스트하는 방법을 배웁니다. +- `black`, `ruff`, `make fix-copies`와 같은 Python 유틸리티를 통합하여 깔끔하고 가독성 있는 코드를 작성하는 방법을 배웁니다. + +Hugging Face 팀은 항상 도움을 줄 준비가 되어 있으므로 혼자가 아니라는 점을 기억하세요. 🤗 ❤️ + +시작에 앞서 🤗 Transformers에 원하는 모델을 추가하기 위해 [New model addition](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&template=new-model-addition.yml) 이슈를 열어야 합니다. 특정 모델을 기여하는 데 특별히 까다로운 기준을 가지지 않는 경우 [New model label](https://github.com/huggingface/transformers/labels/New%20model)을 필터링하여 요청되지 않은 모델이 있는지 확인하고 작업할 수 있습니다. + +새로운 모델 요청을 열었다면 첫 번째 단계는 🤗 Transformers에 익숙해지는 것입니다! + +## 🤗 Transformers의 전반적인 개요 [[general-overview-of-transformers]] + +먼저 🤗 Transformers에 대한 전반적인 개요를 파악해야 합니다. 🤗 Transformers는 매우 주관적인 라이브러리이기 때문에 해당 라이브러리의 철학이나 설계 선택 사항에 동의하지 않을 수도 있습니다. 그러나 우리의 경험상 라이브러리의 기본적인 설계 선택과 철학은 🤗 Transformers의 규모를 효율적으로 확장하면서 유지 보수 비용을 합리적인 수준으로 유지하는 것입니다. + +[라이브러리의 철학에 대한 문서](philosophy)를 읽는 것이 라이브러리를 더 잘 이해하는 좋은 시작점입니다. 모든 모델에 적용하려는 몇 가지 작업 방식에 대한 선택 사항이 있습니다: + +- 일반적으로 추상화보다는 구성을 선호합니다. +- 코드를 복제하는 것이 항상 나쁜 것은 아닙니다. 코드의 가독성이나 접근성을 크게 향상시킨다면 복제하는 것은 좋습니다. +- 모델 파일은 가능한 한 독립적으로 유지되어야 합니다. 따라서 특정 모델의 코드를 읽을 때 해당 `modeling_....py` 파일만 확인하면 됩니다. + +우리는 라이브러리의 코드가 제품을 제공하는 수단뿐만 아니라 개선하고자 하는 제품이라고도 생각합니다. 따라서 모델을 추가할 때, 사용자는 모델을 사용할 사람뿐만 아니라 코드를 읽고 이해하고 필요한 경우 조정할 수 있는 모든 사람까지도 포함한다는 점을 기억해야 합니다. + +이를 염두에 두고 일반적인 라이브러리 설계에 대해 조금 더 자세히 알아보겠습니다. + +### 모델 개요 [[overview-of-models]] + +모델을 성공적으로 추가하려면 모델과 해당 구성인 [`PreTrainedModel`] 및 [`PretrainedConfig`] 간의 상호작용을 이해하는 것이 중요합니다. 예를 들어, 🤗 Transformers에 추가하려는 모델을 `BrandNewBert`라고 부르겠습니다. + +다음을 살펴보겠습니다: + + + +보다시피, 🤗 Transformers에서는 상속을 사용하지만 추상화 수준을 최소한으로 유지합니다. 라이브러리의 어떤 모델에서도 두 수준 이상의 추상화가 존재하지 않습니다. `BrandNewBertModel`은 `BrandNewBertPreTrainedModel`에서 상속받고, 이 클래스는 [`PreTrainedModel`]에서 상속받습니다. 이로써 새로운 모델은 [`PreTrainedModel`]에만 의존하도록 하려고 합니다. 모든 새로운 모델에 자동으로 제공되는 중요한 기능은 [`~PreTrainedModel.from_pretrained`] 및 [`~PreTrainedModel.save_pretrained`]입니다. 이러한 기능 외에도 `BrandNewBertModel.forward`와 같은 다른 중요한 기능은 새로운 `modeling_brand_new_bert.py` 스크립트에서 완전히 정의되어야 합니다. 
또한 `BrandNewBertForMaskedLM`과 같은 특정 헤드 레이어를 가진 모델은 `BrandNewBertModel`을 상속받지 않고 forward pass에서 호출할 수 있는 `BrandNewBertModel`을 사용하여 추상화 수준을 낮게 유지합니다. 모든 새로운 모델은 `BrandNewBertConfig`라는 구성 클래스를 필요로 합니다. 이 구성은 항상 [`PreTrainedModel`]의 속성으로 저장되며, 따라서 `BrandNewBertPreTrainedModel`을 상속받는 모든 클래스에서 `config` 속성을 통해 액세스할 수 있습니다: + +```python +model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert") +model.config # model has access to its config +``` + +모델과 마찬가지로 구성은 [`PretrainedConfig`]에서 기본 직렬화 및 역직렬화 기능을 상속받습니다. 구성과 모델은 항상 *pytorch_model.bin* 파일과 *config.json* 파일로 각각 별도로 직렬화됩니다. [`~PreTrainedModel.save_pretrained`]를 호출하면 자동으로 [`~PretrainedConfig.save_pretrained`]도 호출되므로 모델과 구성이 모두 저장됩니다. + + +### 코드 스타일 [[code-style]] + +새로운 모델을 작성할 때, Transformers는 주관적인 라이브러리이며 몇 가지 독특한 코딩 스타일이 있습니다: + +1. 모델의 forward pass는 모델 파일에 완전히 작성되어야 합니다. 라이브러리의 다른 모델에서 블록을 재사용하려면 코드를 복사하여 위에 `# Copied from` 주석과 함께 붙여넣으면 됩니다 (예: [여기](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160)를 참조하세요). +2. 코드는 완전히 이해하기 쉬워야 합니다. 변수 이름을 명확하게 지정하고 약어를 사용하지 않는 것이 좋습니다. 예를 들어, `act`보다는 `activation`을 선호합니다. 한 글자 변수 이름은 루프의 인덱스인 경우를 제외하고 권장되지 않습니다. +3. 더 일반적으로, 짧은 마법 같은 코드보다는 길고 명시적인 코드를 선호합니다. +4. PyTorch에서 `nn.Sequential`을 하위 클래스로 만들지 말고 `nn.Module`을 하위 클래스로 만들고 forward pass를 작성하여 다른 사람이 코드를 빠르게 디버그할 수 있도록 합니다. print 문이나 중단점을 추가할 수 있습니다. +5. 함수 시그니처에는 타입 주석을 사용해야 합니다. 그 외에는 타입 주석보다 변수 이름이 훨씬 읽기 쉽고 이해하기 쉽습니다. + +### 토크나이저 개요 [[overview-of-tokenizers]] + +아직 준비되지 않았습니다 :-( 이 섹션은 곧 추가될 예정입니다! + +## 🤗 Transformers에 모델 추가하는 단계별 방법 [[stepbystep-recipe-to-add-a-model-to-transformers]] + +각자 모델을 이식하는 방법에 대한 선호가 다르기 때문에 다른 기여자들이 Hugging Face에 모델을 이식하는 방법에 대한 요약을 살펴보는 것이 매우 유용할 수 있습니다. 다음은 모델을 이식하는 방법에 대한 커뮤니티 블로그 게시물 목록입니다: + +1. [GPT2 모델 이식하기](https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28) - [Thomas](https://huggingface.co/thomwolf) +2. [WMT19 MT 모델 이식하기](https://huggingface.co/blog/porting-fsmt) - [Stas](https://huggingface.co/stas) + +경험상 모델을 추가할 때 주의해야 할 가장 중요한 사항은 다음과 같습니다: + +- 같은 일을 반복하지 마세요! 새로운 🤗 Transformers 모델을 위해 추가할 코드의 대부분은 이미 🤗 Transformers 어딘가에 존재합니다. 이미 존재하는 복사할 수 있는 유사한 모델과 토크나이저를 찾는데 시간을 투자하세요. [grep](https://www.gnu.org/software/grep/)와 [rg](https://github.com/BurntSushi/ripgrep)를 참고하세요. 모델의 토크나이저가 한 모델을 기반으로 하고 모델링 코드가 다른 모델을 기반으로 하는 경우가 존재할 수도 있습니다. 예를 들어 FSMT의 모델링 코드는 BART를 기반으로 하고 FSMT의 토크나이저 코드는 XLM을 기반으로 합니다. +- 이것은 과학적인 도전보다는 공학적인 도전입니다. 논문의 모델의 모든 이론적 측면을 이해하려는 것보다 효율적인 디버깅 환경을 만드는 데 더 많은 시간을 소비해야 합니다. +- 막힐 때 도움을 요청하세요! 모델은 🤗 Transformers의 핵심 구성 요소이므로 Hugging Face의 우리는 당신이 모델을 추가하는 각 단계에서 기꺼이 도움을 줄 준비가 되어 있습니다. 진전이 없다고 느끼면 주저하지 말고 도움을 요청하세요. + +다음에서는 모델을 🤗 Transformers로 이식하는 데 가장 유용한 일반적인 절차를 제공하려고 노력합니다. + +다음 목록은 모델을 추가하는 데 수행해야 할 모든 작업의 요약이며 To-Do 목록으로 사용할 수 있습니다: + +☐ (선택 사항) BrandNewBert의 이론적 측면 이해
+☐ Hugging Face 개발 환경 준비
+☐ 원본 리포지토리의 디버깅 환경 설정
+☐ 원본 리포지토리와 체크포인트를 사용하여 `forward()` pass가 성공적으로 실행되는 스크립트 작성
+☐ 🤗 Transformers에 모델 스켈레톤 성공적으로 추가
+☐ 원본 체크포인트를 🤗 Transformers 체크포인트로 성공적으로 변환
+☐ 🤗 Transformers에서 원본 체크포인트와 동일한 출력을 내주는 `forward()` pass 성공적으로 실행
+☐ 🤗 Transformers에서 모델 테스트 완료
+☐ 🤗 Transformers에 토크나이저 성공적으로 추가
+☐ 종단 간 통합 테스트 실행
+☐ 문서 작성 완료
+☐ 모델 가중치를 허브에 업로드
+☐ Pull request 제출
+☐ (선택 사항) 데모 노트북 추가 + +우선, 일반적으로는 `BrandNewBert`의 이론적인 이해로 시작하는 것을 권장합니다. 그러나 이론적 측면을 직접 이해하는 대신 *직접 해보면서* 모델의 이론적 측면을 이해하는 것을 선호하는 경우 바로 `BrandNewBert` 코드 베이스로 빠져드는 것도 괜찮습니다. 이 옵션은 엔지니어링 기술이 이론적 기술보다 더 뛰어난 경우, `BrandNewBert`의 논문을 이해하는 데 어려움이 있는 경우, 또는 과학적인 논문을 읽는 것보다 프로그래밍에 훨씬 더 흥미 있는 경우에 더 적합할 수 있습니다. + +### 1. (선택 사항) BrandNewBert의 이론적 측면 [[1-optional-theoretical-aspects-of-brandnewbert]] + +만약 그런 서술적인 작업이 존재한다면, *BrandNewBert*의 논문을 읽어보는 시간을 가져야 합니다. 이해하기 어려운 섹션이 많을 수 있습니다. 그렇더라도 걱정하지 마세요! 목표는 논문의 깊은 이론적 이해가 아니라 *BrandNewBert*를 🤗 Transformers에서 효과적으로 재구현하기 위해 필요한 정보를 추출하는 것입니다. 이를 위해 이론적 측면에 너무 많은 시간을 투자할 필요는 없지만 다음과 같은 실제적인 측면에 집중해야 합니다: + +- *BrandNewBert*는 어떤 유형의 모델인가요? BERT와 유사한 인코더 모델인가요? GPT2와 유사한 디코더 모델인가요? BART와 유사한 인코더-디코더 모델인가요? 이들 간의 차이점에 익숙하지 않은 경우[model_summary](model_summary)를 참조하세요. +- *BrandNewBert*의 응용 분야는 무엇인가요? 텍스트 분류인가요? 텍스트 생성인가요? 요약과 같은 Seq2Seq 작업인가요? +- *brand_new_bert*와 BERT/GPT-2/BART의 차이점은 무엇인가요? +- *brand_new_bert*와 가장 유사한 [🤗 Transformers 모델](https://huggingface.co/transformers/#contents)은 무엇인가요? +- 어떤 종류의 토크나이저가 사용되나요? Sentencepiece 토크나이저인가요? Word piece 토크나이저인가요? BERT 또는 BART에 사용되는 동일한 토크나이저인가요? + +모델의 아키텍처에 대해 충분히 이해했다는 생각이 든 후, 궁금한 사항이 있으면 Hugging Face 팀에 문의하십시오. 이는 모델의 아키텍처, 어텐션 레이어 등에 관한 질문을 포함할 수 있습니다. Hugging Face의 유지 관리자들은 보통 코드를 검토하는 것에 대해 매우 기뻐하므로 당신을 돕는 일을 매우 환영할 것입니다! + +### 2. 개발 환경 설정 [[2-next-prepare-your-environment]] + +1. 저장소 페이지에서 "Fork" 버튼을 클릭하여 저장소의 사본을 GitHub 사용자 계정으로 만듭니다. + +2. `transformers` fork를 로컬 디스크에 클론하고 베이스 저장소를 원격 저장소로 추가합니다: + +```bash +git clone https://github.com/[your Github handle]/transformers.git +cd transformers +git remote add upstream https://github.com/huggingface/transformers.git +``` + +3. 개발 환경을 설정합니다. 다음 명령을 실행하여 개발 환경을 설정할 수 있습니다: + +```bash +python -m venv .env +source .env/bin/activate +pip install -e ".[dev]" +``` + +각 운영 체제에 따라 Transformers의 선택적 의존성이 개수가 증가하면 이 명령이 실패할 수 있습니다. 그런 경우에는 작업 중인 딥 러닝 프레임워크 (PyTorch, TensorFlow 및/또는 Flax)을 설치한 후, 다음 명령을 수행하면 됩니다: + +```bash +pip install -e ".[quality]" +``` + +대부분의 경우에는 이것으로 충분합니다. 그런 다음 상위 디렉토리로 돌아갑니다. + +```bash +cd .. +``` + +4. Transformers에 *brand_new_bert*의 PyTorch 버전을 추가하는 것을 권장합니다. PyTorch를 설치하려면 다음 링크의 지침을 따르십시오: https://pytorch.org/get-started/locally/. + +**참고:** CUDA를 설치할 필요는 없습니다. 새로운 모델이 CPU에서 작동하도록 만드는 것으로 충분합니다. + +5. *brand_new_bert*를 이식하기 위해서는 해당 원본 저장소에 접근할 수 있어야 합니다: + +```bash +git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git +cd brand_new_bert +pip install -e . +``` + +이제 *brand_new_bert*를 🤗 Transformers로 이식하기 위한 개발 환경을 설정하였습니다. + +### 3.-4. 원본 저장소에서 사전 훈련된 체크포인트 실행하기 [[3.-4.-run-a-pretrained-checkpoint-using-the-original-repository]] + +먼저, 원본 *brand_new_bert* 저장소에서 작업을 시작합니다. 원본 구현은 보통 "연구용"으로 많이 사용됩니다. 즉, 문서화가 부족하고 코드가 이해하기 어려울 수 있습니다. 그러나 이것이 바로 *brand_new_bert*를 다시 구현하려는 동기가 되어야 합니다. Hugging Face에서의 주요 목표 중 하나는 **거인의 어깨 위에 서는 것**이며, 이는 여기에서 쉽게 해석되어 동작하는 모델을 가져와서 가능한 한 **접근 가능하고 사용자 친화적이며 아름답게** 만드는 것입니다. 이것은 🤗 Transformers에서 모델을 다시 구현하는 가장 중요한 동기입니다 - 새로운 복잡한 NLP 기술을 **모두에게** 접근 가능하게 만드는 것을 목표로 합니다. + +따라서 원본 저장소에 대해 자세히 살펴보는 것으로 시작해야 합니다. + +원본 저장소에서 공식 사전 훈련된 모델을 성공적으로 실행하는 것은 종종 **가장 어려운** 단계입니다. 우리의 경험에 따르면, 원본 코드 베이스에 익숙해지는 데 시간을 투자하는 것이 매우 중요합니다. 다음을 파악해야 합니다: + +- 사전 훈련된 가중치를 어디서 찾을 수 있는지? +- 사전 훈련된 가중치를 해당 모델에로드하는 방법은? +- 모델과 독립적으로 토크나이저를 실행하는 방법은? +- 간단한 forward pass에 필요한 클래스와 함수를 파악하기 위해 forward pass를 한 번 추적해 보세요. 일반적으로 해당 함수들만 다시 구현하면 됩니다. +- 모델의 중요한 구성 요소를 찾을 수 있어야 합니다. 모델 클래스는 어디에 있나요? 모델 하위 클래스(*EncoderModel*, *DecoderModel* 등)가 있나요? self-attention 레이어는 어디에 있나요? 
self-attention, cross-attention 등 여러 가지 다른 어텐션 레이어가 있나요? +- 원본 환경에서 모델을 디버그할 수 있는 방법은 무엇인가요? *print* 문을 추가해야 하나요? *ipdb*와 같은 대화식 디버거를 사용할 수 있나요? PyCharm과 같은 효율적인 IDE를 사용해 모델을 디버그할 수 있나요? + +원본 저장소에서 코드를 이식하는 작업을 시작하기 전에 원본 저장소에서 코드를 **효율적으로** 디버그할 수 있어야 합니다! 또한, 오픈 소스 라이브러리로 작업하고 있다는 것을 기억해야 합니다. 따라서 원본 저장소에서 issue를 열거나 pull request를 열기를 주저하지 마십시오. 이 저장소의 유지 관리자들은 누군가가 자신들의 코드를 살펴본다는 것에 대해 매우 기뻐할 것입니다! + +현재 시점에서, 원래 모델을 디버깅하기 위해 어떤 디버깅 환경과 전략을 선호하는지는 당신에게 달렸습니다. 우리는 고가의 GPU 환경을 구축하는 것은 비추천합니다. 대신, 원래 저장소로 들어가서 작업을 시작할 때와 🤗 Transformers 모델의 구현을 시작할 때에도 CPU에서 작업하는 것이 좋습니다. 모델이 이미 🤗 Transformers로 성공적으로 이식되었을 때에만 모델이 GPU에서도 예상대로 작동하는지 확인해야합니다. + +일반적으로, 원래 모델을 실행하기 위한 두 가지 가능한 디버깅 환경이 있습니다. + +- [Jupyter 노트북](https://jupyter.org/) / [Google Colab](https://colab.research.google.com/notebooks/intro.ipynb) +- 로컬 Python 스크립트 + +Jupyter 노트북의 장점은 셀 단위로 실행할 수 있다는 것입니다. 이는 논리적인 구성 요소를 더 잘 분리하고 중간 결과를 저장할 수 있으므로 디버깅 사이클이 더 빨라질 수 있습니다. 또한, 노트북은 다른 기여자와 쉽게 공유할 수 있으므로 Hugging Face 팀의 도움을 요청하려는 경우 매우 유용할 수 있습니다. Jupyter 노트북에 익숙하다면 이를 사용하는 것을 강력히 추천합니다. + +Jupyter 노트북의 단점은 사용에 익숙하지 않은 경우 새로운 프로그래밍 환경에 적응하는 데 시간을 할애해야 하며, `ipdb`와 같은 알려진 디버깅 도구를 더 이상 사용할 수 없을 수도 있다는 것입니다. + +각 코드 베이스에 대해 좋은 첫 번째 단계는 항상 **작은** 사전 훈련된 체크포인트를 로드하고 더미 정수 벡터 입력을 사용하여 단일 forward pass를 재현하는 것입니다. 이와 같은 스크립트는 다음과 같을 수 있습니다(의사 코드로 작성): + +```python +model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") +input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids +original_output = model.predict(input_ids) +``` + +다음으로, 디버깅 전략에 대해 일반적으로 다음과 같은 몇 가지 선택지가 있습니다: + +- 원본 모델을 많은 작은 테스트 가능한 구성 요소로 분해하고 각각에 대해 forward pass를 실행하여 검증합니다. +- 원본 모델을 원본 *tokenizer*과 원본 *model*로만 분해하고 해당 부분에 대해 forward pass를 실행한 후 검증을 위해 중간 출력(print 문 또는 중단점)을 사용합니다. + +다시 말하지만, 어떤 전략을 선택할지는 당신에게 달려 있습니다. 원본 코드 베이스에 따라 하나 또는 다른 전략이 유리할 수 있습니다. + +원본 코드 베이스를 모델의 작은 하위 구성 요소로 분해할 수 있는지 여부, 예를 들어 원본 코드 베이스가 즉시 실행 모드에서 간단히 실행될 수 있는 경우, 그런 경우에는 그 노력이 가치가 있다는 것이 일반적입니다. 초기에 더 어려운 방법을 선택하는 것에는 몇 가지 중요한 장점이 있습니다. + +- 원본 모델을 🤗 Transformers 구현과 비교할 때 각 구성 요소가 일치하는지 자동으로 확인할 수 있습니다. 즉, 시각적인 비교(print 문을 통한 비교가 아닌) 대신 🤗 Transformers 구현과 그에 대응하는 원본 구성 요소가 일치하는지 확인할 수 있습니다. +- 전체 모델을 모듈별로, 즉 작은 구성 요소로 분해함으로써 모델을 이식하는 큰 문제를 단순히 개별 구성 요소를 이식하는 작은 문제로 분해할 수 있으므로 작업을 더 잘 구조화할 수 있습니다. +- 모델을 논리적으로 의미 있는 구성 요소로 분리하는 것은 모델의 설계에 대한 더 나은 개요를 얻고 모델을 더 잘 이해하는 데 도움이 됩니다. +- 이러한 구성 요소별 테스트를 통해 코드를 변경하면서 회귀가 발생하지 않도록 보장할 수 있습니다. + +[Lysandre의 ELECTRA 통합 검사](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed)는 이를 수행하는 좋은 예제입니다. + +그러나 원본 코드 베이스가 매우 복잡하거나 중간 구성 요소를 컴파일된 모드에서 실행하는 것만 허용하는 경우, 모델을 테스트 가능한 작은 하위 구성 요소로 분해하는 것이 시간이 많이 소요되거나 불가능할 수도 있습니다. [T5의 MeshTensorFlow](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow) 라이브러리는 매우 복잡하며 모델을 하위 구성 요소로 분해하는 간단한 방법을 제공하지 않습니다. 이러한 라이브러리의 경우, 보통 print 문을 통해 확인합니다. + +어떤 전략을 선택하더라도 권장되는 절차는 동일합니다. 먼저 시작 레이어를 디버그하고 마지막 레이어를 마지막에 디버그하는 것이 좋습니다. + +다음 순서로 각 레이어의 출력을 검색하는 것이 좋습니다: + +1. 모델에 전달된 입력 ID 가져오기 +2. 워드 임베딩 가져오기 +3. 첫 번째 Transformer 레이어의 입력 가져오기 +4. 첫 번째 Transformer 레이어의 출력 가져오기 +5. 다음 n-1개의 Transformer 레이어의 출력 가져오기 +6. BrandNewBert 모델의 출력 가져오기 + +입력 ID는 정수 배열로 구성되며, 예를 들어 `input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]`와 같을 수 있습니다. 
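+
+예를 들어 원본 모델이 PyTorch의 `nn.Module`로 작성되어 있다면, 아래와 같은 스케치(의사 코드로 작성)처럼 forward hook을 등록해 위 순서의 중간 출력을 한 번에 수집해 둘 수 있습니다. `BrandNewBertModel`과 체크포인트 경로는 앞의 예시와 같은 가정이며, 실제 모듈 이름과 호출 방법은 원본 저장소의 구조에 따라 달라집니다.
+
+```python
+import torch
+
+# 나중에 🤗 Transformers 구현의 동일한 위치 출력과 비교할 수 있도록 중간 출력을 저장해 둡니다.
+intermediate_outputs = {}
+
+
+def save_output_hook(name):
+    def hook(module, inputs, output):
+        intermediate_outputs[name] = output
+
+    return hook
+
+
+model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/")
+
+# 워드 임베딩, 각 Transformer 레이어 등 모든 서브모듈에 hook을 등록합니다.
+for name, module in model.named_modules():
+    module.register_forward_hook(save_output_hook(name))
+
+input_ids = torch.tensor([[0, 4, 4, 3, 2, 4, 1, 7, 19]])
+with torch.no_grad():
+    original_output = model(input_ids)
+
+# 모듈 이름은 모델마다 다르므로 먼저 어떤 이름으로 저장되었는지 확인합니다.
+print(intermediate_outputs.keys())
+```
+
+이렇게 수집한 중간 출력은 이후 🤗 Transformers 구현의 같은 위치 출력과 비교하는 데 사용할 수 있습니다.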
+ +다음 레이어의 출력은 종종 다차원 실수 배열로 구성되며, 다음과 같이 나타낼 수 있습니다: + +``` +[[ + [-0.1465, -0.6501, 0.1993, ..., 0.1451, 0.3430, 0.6024], + [-0.4417, -0.5920, 0.3450, ..., -0.3062, 0.6182, 0.7132], + [-0.5009, -0.7122, 0.4548, ..., -0.3662, 0.6091, 0.7648], + ..., + [-0.5613, -0.6332, 0.4324, ..., -0.3792, 0.7372, 0.9288], + [-0.5416, -0.6345, 0.4180, ..., -0.3564, 0.6992, 0.9191], + [-0.5334, -0.6403, 0.4271, ..., -0.3339, 0.6533, 0.8694]]], +``` + +🤗 Transformers에 추가되는 모든 모델은 통합 테스트를 통과해야 합니다. 즉, 원본 모델과 🤗 Transformers의 재구현 버전이 0.001의 정밀도로 정확히 동일한 출력을 내야 합니다! 동일한 모델이 다른 라이브러리에서 작성되었을 때 라이브러리 프레임워크에 따라 약간 다른 출력을 얻는 것은 정상이므로 1e-3(0.001)의 오차는 허용합니다. 거의 동일한 출력을 내는 것만으로는 충분하지 않으며, 완벽히 일치하는 수준이어야 합니다. 따라서 🤗 Transformers 버전의 중간 출력을 *brand_new_bert*의 원래 구현의 중간 출력과 여러 번 비교해야 합니다. 이 경우 원본 저장소의 **효율적인** 디버깅 환경이 절대적으로 중요합니다. 디버깅 환경을 가능한 한 효율적으로 만드는 몇 가지 조언을 제시합니다. + +- 중간 결과를 디버그하는 가장 좋은 방법을 찾으세요. 원본 저장소가 PyTorch로 작성되었다면 원본 모델을 더 작은 하위 구성 요소로 분해하여 중간 값을 검색하는 긴 스크립트를 작성하는 것에 시간을 투자할 가치가 있습니다. 원본 저장소가 Tensorflow 1로 작성되었다면 [tf.print](https://www.tensorflow.org/api_docs/python/tf/print)와 같은 Tensorflow 출력 작업을 사용하여 중간 값을 출력해야 할 수도 있습니다. 원본 저장소가 Jax로 작성되었다면 forward pass를 실행할 때 모델이 **jit 되지 않도록** 해야 합니다. 예를 들어 [이 링크](https://github.com/google/jax/issues/196)를 확인해 보세요. +- 사용 가능한 가장 작은 사전 훈련된 체크포인트를 사용하세요. 체크포인트가 작을수록 디버그 사이클이 더 빨라집니다. 전반적으로 forward pass에 10초 이상이 걸리는 경우 효율적이지 않습니다. 매우 큰 체크포인트만 사용할 수 있는 경우, 새 환경에서 임의로 초기화된 가중치로 더미 모델을 만들고 해당 가중치를 🤗 Transformers 버전과 비교하기 위해 저장하는 것이 더 의미가 있을 수 있습니다. +- 디버깅 설정에서 가장 쉽게 forward pass를 호출하는 방법을 사용하세요. 원본 저장소에서 **단일** forward pass만 호출하는 함수를 찾는 것이 이상적입니다. 이 함수는 일반적으로 `predict`, `evaluate`, `forward`, `__call__`과 같이 호출됩니다. `autoregressive_sample`과 같은 텍스트 생성에서 `forward`를 여러 번 호출하여 텍스트를 생성하는 등의 작업을 수행하는 함수를 디버그하고 싶지 않을 것입니다. +- 토큰화 과정을 모델의 *forward* pass와 분리하려고 노력하세요. 원본 저장소에서 입력 문자열을 입력해야 하는 예제가 있는 경우, 입력 문자열이 입력 ID로 변경되는 순간을 찾아서 시작하세요. 이 경우 직접 ID를 입력할 수 있도록 작은 스크립트를 작성하거나 원본 코드를 수정해야 할 수도 있습니다. +- 디버깅 설정에서 모델이 훈련 모드가 아니라는 것을 확인하세요. 훈련 모드에서는 모델의 여러 드롭아웃 레이어 때문에 무작위 출력이 생성될 수 있습니다. 디버깅 환경에서 forward pass가 **결정론적**이도록 해야 합니다. 또는 동일한 프레임워크에 있는 경우 *transformers.utils.set_seed*를 사용하세요. + +다음 섹션에서는 *brand_new_bert*에 대해 이 작업을 수행하는 데 더 구체적인 세부 사항/팁을 제공합니다. + +### 5.-14. 🤗 Transformers에 BrandNewBert를 이식하기 [[5.-14.-port-brandnewbert-to-transformers]] + +이제, 마침내 🤗 Transformers에 새로운 코드를 추가할 수 있습니다. 🤗 Transformers 포크의 클론으로 이동하세요: + +```bash +cd transformers +``` + +다음과 같이 이미 존재하는 모델의 모델 아키텍처와 정확히 일치하는 모델을 추가하는 특별한 경우에는 [이 섹션](#write-a-conversion-script)에 설명된대로 변환 스크립트만 추가하면 됩니다. 이 경우에는 이미 존재하는 모델의 전체 모델 아키텍처를 그대로 재사용할 수 있습니다. + +그렇지 않으면 새로운 모델 생성을 시작합시다. 여기에서 두 가지 선택지가 있습니다: + +- `transformers-cli add-new-model-like`를 사용하여 기존 모델과 유사한 새로운 모델 추가하기 +- `transformers-cli add-new-model`을 사용하여 템플릿을 기반으로 한 새로운 모델 추가하기 (선택한 모델 유형에 따라 BERT 또는 Bart와 유사한 모습일 것입니다) + +두 경우 모두, 모델의 기본 정보를 입력하는 설문조사가 제시됩니다. 두 번째 명령어는 `cookiecutter`를 설치해야 합니다. 자세한 정보는 [여기](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model)에서 확인할 수 있습니다. + +**huggingface/transformers 메인 저장소에 Pull Request 열기** + +자동으로 생성된 코드를 수정하기 전에, 지금은 "작업 진행 중 (WIP)" 풀 리퀘스트를 열기 위한 시기입니다. 예를 들어, 🤗 Transformers에 "*brand_new_bert* 추가"라는 제목의 "[WIP] Add *brand_new_bert*" 풀 리퀘스트를 엽니다. 이렇게 하면 당신과 Hugging Face 팀이 🤗 Transformers에 모델을 통합하는 작업을 함께할 수 있습니다. + +다음을 수행해야 합니다: + +1. 메인 브랜치에서 작업을 잘 설명하는 이름으로 브랜치 생성 + +```bash +git checkout -b add_brand_new_bert +``` + +2. 자동으로 생성된 코드 커밋 + +```bash +git add . +git commit +``` + +3. 현재 메인을 가져오고 리베이스 + +```bash +git fetch upstream +git rebase upstream/main +``` + +4. 
변경 사항을 계정에 푸시 + +```bash +git push -u origin a-descriptive-name-for-my-changes +``` + +5. 만족스럽다면, GitHub에서 자신의 포크한 웹 페이지로 이동합니다. "Pull request"를 클릭합니다. Hugging Face 팀의 일부 멤버의 GitHub 핸들을 리뷰어로 추가하여 Hugging Face 팀이 앞으로의 변경 사항에 대해 알림을 받을 수 있도록 합니다. + +6. GitHub 풀 리퀘스트 웹 페이지 오른쪽에 있는 "Convert to draft"를 클릭하여 PR을 초안으로 변경합니다. + +다음으로, 어떤 진전을 이루었다면 작업을 커밋하고 계정에 푸시하여 풀 리퀘스트에 표시되도록 해야 합니다. 또한, 다음과 같이 현재 메인과 작업을 업데이트해야 합니다: + +```bash +git fetch upstream +git merge upstream/main +``` + +일반적으로, 모델 또는 구현에 관한 모든 질문은 자신의 PR에서 해야 하며, PR에서 토론되고 해결되어야 합니다. 이렇게 하면 Hugging Face 팀이 새로운 코드를 커밋하거나 질문을 할 때 항상 알림을 받을 수 있습니다. Hugging Face 팀에게 문제 또는 질문을 효율적으로 이해할 수 있도록 추가한 코드를 명시하는 것이 도움이 될 때가 많습니다. + +이를 위해, 변경 사항을 모두 볼 수 있는 "Files changed" 탭으로 이동하여 질문하고자 하는 줄로 이동한 다음 "+" 기호를 클릭하여 코멘트를 추가할 수 있습니다. 질문이나 문제가 해결되면, 생성된 코멘트의 "Resolve" 버튼을 클릭할 수 있습니다. + +마찬가지로, Hugging Face 팀은 코드를 리뷰할 때 코멘트를 남길 것입니다. 우리는 PR에서 대부분의 질문을 GitHub에서 묻는 것을 권장합니다. 공개에 크게 도움이 되지 않는 매우 일반적인 질문의 경우, Slack이나 이메일을 통해 Hugging Face 팀에게 문의할 수 있습니다. + +**5. brand_new_bert에 대해 생성된 모델 코드를 적용하기** + +먼저, 우리는 모델 자체에만 초점을 맞추고 토크나이저에 대해서는 신경 쓰지 않을 것입니다. 모든 관련 코드는 다음의 생성된 파일에서 찾을 수 있습니다: `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` 및 `src/transformers/models/brand_new_bert/configuration_brand_new_bert.py`. + +이제 마침내 코딩을 시작할 수 있습니다 :). `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`의 생성된 코드는 인코더 전용 모델인 경우 BERT와 동일한 아키텍처를 가지거나, 인코더-디코더 모델인 경우 BART와 동일한 아키텍처를 가질 것입니다. 이 시점에서, 모델의 이론적 측면에 대해 배운 내용을 다시 상기해야 합니다: *모델이 BERT 또는 BART와 어떻게 다른가요?*. 자주 변경해야 하는 것은 *self-attention* 레이어, 정규화 레이어의 순서 등을 변경하는 것입니다. 다시 말하지만, 자신의 모델을 구현하는 데 도움이 되도록 Transformers에서 이미 존재하는 모델의 유사한 아키텍처를 살펴보는 것이 유용할 수 있습니다. + +**참고로** 이 시점에서, 코드가 완전히 정확하거나 깨끗하다고 확신할 필요는 없습니다. 오히려 처음에는 원본 코드의 첫 번째 *불완전하고* 복사된 버전을 `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`에 추가하는 것이 좋습니다. 필요한 모든 코드가 추가될 때까지 이러한 작업을 진행한 후, 다음 섹션에서 설명한 변환 스크립트를 사용하여 코드를 점진적으로 개선하고 수정하는 것이 훨씬 효율적입니다. 이 시점에서 작동해야 하는 유일한 것은 다음 명령이 작동하는 것입니다: + +```python +from transformers import BrandNewBertModel, BrandNewBertConfig + +model = BrandNewBertModel(BrandNewBertConfig()) +``` + +위의 명령은 `BrandNewBertConfig()`에 정의된 기본 매개변수에 따라 무작위 가중치로 모델을 생성하며, 이로써 모든 구성 요소의 `init()` 메서드가 작동함을 보장합니다. + +모든 무작위 초기화는 `BrandnewBertPreTrainedModel` 클래스의 `_init_weights` 메서드에서 수행되어야 합니다. 이 메서드는 구성 설정 변수에 따라 모든 리프 모듈을 초기화해야 합니다. BERT의 `_init_weights` 메서드 예제는 다음과 같습니다: + +```py +def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) +``` + +몇 가지 모듈에 대해 특별한 초기화가 필요한 경우 사용자 정의 방식을 사용할 수도 있습니다. 예를 들어, `Wav2Vec2ForPreTraining`에서 마지막 두 개의 선형 레이어는 일반적인 PyTorch `nn.Linear`의 초기화를 가져야 하지만, 다른 모든 레이어는 위와 같은 초기화를 사용해야 합니다. 
이는 다음과 같이 코드화됩니다: + +```py +def _init_weights(self, module): + """Initialize the weights""" + if isinstnace(module, Wav2Vec2ForPreTraining): + module.project_hid.reset_parameters() + module.project_q.reset_parameters() + module.project_hid._is_hf_initialized = True + module.project_q._is_hf_initialized = True + elif isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() +``` + +`_is_hf_initialized` 플래그는 서브모듈을 한 번만 초기화하도록 내부적으로 사용됩니다. `module.project_q` 및 `module.project_hid`에 대해 `True`로 설정함으로써, 우리가 수행한 사용자 정의 초기화가 이후에 덮어쓰이지 않도록 합니다. 즉, `_init_weights` 함수가 이들에게 적용되지 않습니다. + +**6. 변환 스크립트 작성하기** + +다음으로, 디버그에 사용한 체크포인트를 기존 저장소에서 만든 🤗 Transformers 구현과 호환되는 체크포인트로 변환할 수 있는 변환 스크립트를 작성해야 합니다. 변환 스크립트를 처음부터 작성하는 것보다는 *brand_new_bert*와 동일한 프레임워크로 작성된 유사한 모델을 변환한 기존 변환 스크립트를 찾아보는 것이 좋습니다. 일반적으로 기존 변환 스크립트를 복사하여 사용 사례에 맞게 약간 수정하는 것으로 충분합니다. 모델에 대해 유사한 기존 변환 스크립트를 어디에서 찾을 수 있는지 Hugging Face 팀에게 문의하는 것을 망설이지 마세요. + +- TensorFlow에서 PyTorch로 모델을 이전하는 경우, 좋은 참고 자료로 BERT의 변환 스크립트 [여기](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91)를 참조할 수 있습니다. +- PyTorch에서 PyTorch로 모델을 이전하는 경우, 좋은 참고 자료로 BART의 변환 스크립트 [여기](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py)를 참조할 수 있습니다. + +다음에서는 PyTorch 모델이 레이어 가중치를 저장하고 레이어 이름을 정의하는 방법에 대해 간단히 설명하겠습니다. PyTorch에서 레이어의 이름은 레이어에 지정한 클래스 속성의 이름으로 정의됩니다. 다음과 같이 PyTorch에서 `SimpleModel`이라는 더미 모델을 정의해 봅시다: + +```python +from torch import nn + + +class SimpleModel(nn.Module): + def __init__(self): + super().__init__() + self.dense = nn.Linear(10, 10) + self.intermediate = nn.Linear(10, 10) + self.layer_norm = nn.LayerNorm(10) +``` + +이제 이 모델 정의의 인스턴스를 생성할 수 있으며 `dense`, `intermediate`, `layer_norm` 등의 가중치가 랜덤하게 할당됩니다. 모델을 출력하여 아키텍처를 확인할 수 있습니다. + +```python +model = SimpleModel() + +print(model) +``` + +이는 다음과 같이 출력됩니다: + +``` +SimpleModel( + (dense): Linear(in_features=10, out_features=10, bias=True) + (intermediate): Linear(in_features=10, out_features=10, bias=True) + (layer_norm): LayerNorm((10,), eps=1e-05, elementwise_affine=True) +) +``` + +우리는 레이어의 이름이 PyTorch에서 클래스 속성의 이름으로 정의되어 있는 것을 볼 수 있습니다. 특정 레이어의 가중치 값을 출력하여 확인할 수 있습니다: + +```python +print(model.dense.weight.data) +``` + +가중치가 무작위로 초기화되었음을 확인할 수 있습니다. + +``` +tensor([[-0.0818, 0.2207, -0.0749, -0.0030, 0.0045, -0.1569, -0.1598, 0.0212, + -0.2077, 0.2157], + [ 0.1044, 0.0201, 0.0990, 0.2482, 0.3116, 0.2509, 0.2866, -0.2190, + 0.2166, -0.0212], + [-0.2000, 0.1107, -0.1999, -0.3119, 0.1559, 0.0993, 0.1776, -0.1950, + -0.1023, -0.0447], + [-0.0888, -0.1092, 0.2281, 0.0336, 0.1817, -0.0115, 0.2096, 0.1415, + -0.1876, -0.2467], + [ 0.2208, -0.2352, -0.1426, -0.2636, -0.2889, -0.2061, -0.2849, -0.0465, + 0.2577, 0.0402], + [ 0.1502, 0.2465, 0.2566, 0.0693, 0.2352, -0.0530, 0.1859, -0.0604, + 0.2132, 0.1680], + [ 0.1733, -0.2407, -0.1721, 0.1484, 0.0358, -0.0633, -0.0721, -0.0090, + 0.2707, -0.2509], + [-0.1173, 0.1561, 0.2945, 0.0595, -0.1996, 0.2988, -0.0802, 0.0407, + 0.1829, -0.1568], + [-0.1164, -0.2228, -0.0403, 0.0428, 0.1339, 0.0047, 0.1967, 0.2923, + 0.0333, -0.0536], + [-0.1492, -0.1616, 0.1057, 0.1950, -0.2807, -0.2710, -0.1586, 0.0739, + 0.2220, 0.2358]]). +``` + +변환 스크립트에서는 이러한 무작위로 초기화된 가중치를 체크포인트의 해당 레이어의 정확한 가중치로 채워야 합니다. 예를 들면 다음과 같습니다: + +```python +# retrieve matching layer weights, e.g. 
by +# recursive algorithm +layer_name = "dense" +pretrained_weight = array_of_dense_layer + +model_pointer = getattr(model, "dense") + +model_pointer.weight.data = torch.from_numpy(pretrained_weight) +``` + +이렇게 하면 PyTorch 모델의 무작위로 초기화된 각 가중치와 해당 체크포인트 가중치가 **모양과 이름** 모두에서 정확히 일치하는지 확인해야 합니다. 이를 위해 모양에 대한 assert 문을 추가하고 체크포인트 가중치의 이름을 출력해야 합니다. 예를 들어 다음과 같은 문장을 추가해야 합니다: + +```python +assert ( + model_pointer.weight.shape == pretrained_weight.shape +), f"Pointer shape of random weight {model_pointer.shape} and array shape of checkpoint weight {pretrained_weight.shape} mismatched" +``` + +또한 두 가중치의 이름을 출력하여 일치하는지 확인해야 합니다. *예시*: + +```python +logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}") +``` + +모양 또는 이름이 일치하지 않는 경우, 랜덤으로 초기화된 레이어에 잘못된 체크포인트 가중치를 할당한 것으로 추측됩니다. + +잘못된 모양은 `BrandNewBertConfig()`의 구성 매개변수 설정이 변환하려는 체크포인트에 사용된 설정과 정확히 일치하지 않기 때문일 가능성이 가장 큽니다. 그러나 PyTorch의 레이어 구현 자체에서 가중치를 전치해야 할 수도 있습니다. + +마지막으로, **모든** 필요한 가중치가 초기화되었는지 확인하고 초기화에 사용되지 않은 모든 체크포인트 가중치를 출력하여 모델이 올바르게 변환되었는지 확인해야 합니다. 잘못된 모양 문장이나 잘못된 이름 할당으로 인해 변환 시도가 실패하는 것은 완전히 정상입니다. 이는 `BrandNewBertConfig()`에서 잘못된 매개변수를 사용하거나 🤗 Transformers 구현에서 잘못된 아키텍처, 🤗 Transformers 구현의 구성 요소 중 하나의 `init()` 함수에 버그가 있는 경우이거나 체크포인트 가중치 중 하나를 전치해야 하는 경우일 가능성이 가장 높습니다. + +이 단계는 이전 단계와 함께 반복되어야 하며 모든 체크포인트의 가중치가 Transformers 모델에 올바르게 로드되었을 때까지 계속되어야 합니다. 🤗 Transformers 구현에 체크포인트를 올바르게 로드한 후에는 `/path/to/converted/checkpoint/folder`와 같은 원하는 폴더에 모델을 저장할 수 있어야 합니다. 해당 폴더에는 `pytorch_model.bin` 파일과 `config.json` 파일이 모두 포함되어야 합니다. + +```python +model.save_pretrained("/path/to/converted/checkpoint/folder") +``` + +**7. 순방향 패스 구현하기** + +🤗 Transformers 구현에 사전 훈련된 가중치를 정확하게 로드한 후에는 순방향 패스가 올바르게 구현되었는지 확인해야 합니다. [원본 저장소에 익숙해지기](#34-run-a-pretrained-checkpoint-using-the-original-repository)에서 이미 원본 저장소를 사용하여 모델의 순방향 패스를 실행하는 스크립트를 만들었습니다. 이제 원본 대신 🤗 Transformers 구현을 사용하는 유사한 스크립트를 작성해야 합니다. 다음과 같이 작성되어야 합니다: + +```python +model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder") +input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19] +output = model(input_ids).last_hidden_states +``` + +🤗 Transformers 구현과 원본 모델 구현이 처음부터 정확히 동일한 출력을 제공하지 않거나 순방향 패스에서 오류가 발생할 가능성이 매우 높습니다. 실망하지 마세요. 예상된 일입니다! 먼저, 순방향 패스에서 오류가 발생하지 않도록 해야 합니다. 종종 잘못된 차원이 사용되어 *차원 불일치* 오류가 발생하거나 잘못된 데이터 유형 개체가 사용되는 경우가 있습니다. 예를 들면 `torch.long` 대신에 `torch.float32`가 사용된 경우입니다. 해결할 수 없는 오류가 발생하면 Hugging Face 팀에 도움을 요청하는 것이 좋습니다. + +🤗 Transformers 구현이 올바르게 작동하는지 확인하는 마지막 단계는 출력이 `1e-3`의 정밀도로 동일한지 확인하는 것입니다. 먼저, 출력 모양이 동일하도록 보장해야 합니다. 즉, 🤗 Transformers 구현 스크립트와 원본 구현 사이에서 `outputs.shape`는 동일한 값을 반환해야 합니다. 그 다음으로, 출력 값이 동일하도록 해야 합니다. 이는 새로운 모델을 추가할 때 가장 어려운 부분 중 하나입니다. 출력이 동일하지 않은 일반적인 실수 사례는 다음과 같습니다: + +- 일부 레이어가 추가되지 않았습니다. 즉, *활성화* 레이어가 추가되지 않았거나 잔차 연결이 빠졌습니다. +- 단어 임베딩 행렬이 연결되지 않았습니다. +- 잘못된 위치 임베딩이 사용되었습니다. 원본 구현에서는 오프셋을 사용합니다. +- 순방향 패스 중에 Dropout이 적용되었습니다. 이를 수정하려면 *model.training이 False*인지 확인하고 순방향 패스 중에 Dropout 레이어가 잘못 활성화되지 않도록 하세요. 즉, [PyTorch의 기능적 Dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout)에 *self.training*을 전달하세요. + +문제를 해결하는 가장 좋은 방법은 일반적으로 원본 구현과 🤗 Transformers 구현의 순방향 패스를 나란히 놓고 차이점이 있는지 확인하는 것입니다. 이상적으로는 순방향 패스의 중간 출력을 디버그/출력하여 원본 구현과 🤗 Transformers 구현의 정확한 위치를 찾을 수 있어야 합니다. 먼저, 두 스크립트의 하드코딩된 `input_ids`가 동일한지 확인하세요. 다음으로, `input_ids`의 첫 번째 변환의 출력(일반적으로 단어 임베딩)이 동일한지 확인하세요. 그런 다음 네트워크의 가장 마지막 레이어까지 진행해보세요. 어느 시점에서 두 구현 사이에 차이가 있는 것을 알게 되는데, 이는 🤗 Transformers 구현의 버그 위치를 가리킬 것입니다. 
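To make that side-by-side comparison concrete, a small helper like the sketch below can report how far apart two intermediate activations are. This is only an illustrative snippet, not part of the library: the helper name, the tolerance, and the dummy tensors are placeholders — in practice you would pass in the embedding output, per-layer hidden states, etc. captured from the original script and from the 🤗 Transformers script.

```python
import torch


def compare_intermediate(name, original_tensor, hf_tensor, atol=1e-3):
    """Report the maximum absolute difference between two intermediate activations."""
    max_diff = (original_tensor - hf_tensor).abs().max().item()
    is_close = torch.allclose(original_tensor, hf_tensor, atol=atol)
    print(f"{name}: max abs diff = {max_diff:.2e}, allclose(atol={atol}) = {is_close}")
    return is_close


# Dummy tensors standing in for e.g. the word embedding output of both implementations
original_embeddings = torch.randn(1, 9, 16)
hf_embeddings = original_embeddings + 1e-5 * torch.randn(1, 9, 16)
compare_intermediate("word embeddings", original_embeddings, hf_embeddings)
```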
저희 경험상으로는 원본 구현과 🤗 Transformers 구현 모두에서 동일한 위치에 많은 출력 문을 추가하고 이들의 중간 표현에 대해 동일한 값을 보이는 출력 문을 연속적으로 제거하는 것이 간단하고 효과적인 방법입니다. + +`torch.allclose(original_output, output, atol=1e-3)`로 출력을 확인하여 두 구현이 동일한 출력을 하는 것을 확신한다면, 가장 어려운 부분은 끝났습니다! 축하드립니다. 남은 작업은 쉬운 일이 될 것입니다 😊. + +**8. 필요한 모든 모델 테스트 추가하기** + +이 시점에서 새로운 모델을 성공적으로 추가했습니다. 그러나 해당 모델이 요구되는 디자인에 완전히 부합하지 않을 수도 있습니다. 🤗 Transformers와 완벽하게 호환되는 구현인지 확인하기 위해 모든 일반 테스트를 통과해야 합니다. Cookiecutter는 아마도 모델을 위한 테스트 파일을 자동으로 추가했을 것입니다. 아마도 `tests/models/brand_new_bert/test_modeling_brand_new_bert.py`와 같은 경로에 위치할 것입니다. 이 테스트 파일을 실행하여 일반 테스트가 모두 통과하는지 확인하세요. + +```bash +pytest tests/models/brand_new_bert/test_modeling_brand_new_bert.py +``` + +모든 일반 테스트를 수정한 후, 이제 수행한 작업을 충분히 테스트하여 다음 사항을 보장해야 합니다. + +- a) 커뮤니티가 *brand_new_bert*의 특정 테스트를 살펴봄으로써 작업을 쉽게 이해할 수 있도록 함 +- b) 모델에 대한 향후 변경 사항이 모델의 중요한 기능을 손상시키지 않도록 함 + +먼저 통합 테스트를 추가해야 합니다. 이러한 통합 테스트는 이전에 모델을 🤗 Transformers로 구현하기 위해 사용한 디버깅 스크립트와 동일한 작업을 수행합니다. Cookiecutter에 이미 이러한 모델 테스트의 템플릿인 `BrandNewBertModelIntegrationTests`가 추가되어 있으며, 여러분이 작성해야 할 내용으로만 채워 넣으면 됩니다. 이러한 테스트가 통과하는지 확인하려면 다음을 실행하세요. + +```bash +RUN_SLOW=1 pytest -sv tests/models/brand_new_bert/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests +``` + + + +Windows를 사용하는 경우 `RUN_SLOW=1`을 `SET RUN_SLOW=1`로 바꿔야 합니다. + + + +둘째로, *brand_new_bert*에 특화된 모든 기능도 별도의 테스트에서 추가로 테스트해야 합니다. 이 부분은 종종 잊히는데, 두 가지 측면에서 굉장히 유용합니다. + +- *brand_new_bert*의 특수 기능이 어떻게 작동해야 하는지 보여줌으로써 커뮤니티에게 모델 추가 과정에서 습득한 지식을 전달하는 데 도움이 됩니다. +- 향후 기여자는 이러한 특수 테스트를 실행하여 모델에 대한 변경 사항을 빠르게 테스트할 수 있습니다. + + +**9. 토크나이저 구현하기** + +다음으로, *brand_new_bert*의 토크나이저를 추가해야 합니다. 보통 토크나이저는 🤗 Transformers의 기존 토크나이저와 동일하거나 매우 유사합니다. + +토크나이저가 올바르게 작동하는지 확인하기 위해 먼저 원본 리포지토리에서 문자열을 입력하고 `input_ids`를 반환하는 스크립트를 생성하는 것이 좋습니다. 다음과 같은 유사한 스크립트일 수 있습니다 (의사 코드로 작성): + +```python +input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." +model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") +input_ids = model.tokenize(input_str) +``` + +원본 리포지토리를 자세히 살펴보고 올바른 토크나이저 함수를 찾거나, 복제본에서 변경 사항을 적용하여 `input_ids`만 출력하도록 해야 합니다. 원본 리포지토리를 사용하는 기능적인 토큰화 스크립트를 작성한 후, 🤗 Transformers의 유사한 스크립트를 생성해야 합니다. 다음과 같이 작성되어야 합니다: + +```python +from transformers import BrandNewBertTokenizer + +input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." + +tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/") + +input_ids = tokenizer(input_str).input_ids +``` + +두 개의 `input_ids`가 동일한 값을 반환할 때, 마지막 단계로 토크나이저 테스트 파일도 추가해야 합니다. + +*brand_new_bert*의 모델링 테스트 파일과 유사하게, *brand_new_bert*의 토크나이제이션 테스트 파일에는 몇 가지 하드코딩된 통합 테스트가 포함되어야 합니다. + +**10. 종단 간 통합 테스트 실행** + +토크나이저를 추가한 후에는 모델과 토크나이저를 사용하여 몇 가지 종단 간 통합 테스트를 추가해야 합니다. `tests/models/brand_new_bert/test_modeling_brand_new_bert.py`에 추가해주세요. 이러한 테스트는 🤗 Transformers 구현이 예상대로 작동하는지를 의미 있는 text-to-text 예시로 보여줘야 합니다. 그 예시로는 *예를 들어* source-to-target 번역 쌍, article-to-summary 쌍, question-to-answer 쌍 등이 포함될 수 있습니다. 불러온 체크포인트 중 어느 것도 다운스트림 작업에서 미세 조정되지 않았다면, 모델 테스트만으로 충분합니다. 모델이 완전히 기능을 갖추었는지 확인하기 위해 마지막 단계로 GPU에서 모든 테스트를 실행하는 것이 좋습니다. 모델의 내부 텐서의 일부에 `.to(self.device)` 문을 추가하는 것을 잊었을 수 있으며, 이 경우 테스트에서 오류로 표시됩니다. GPU에 액세스할 수 없는 경우, Hugging Face 팀이 테스트를 대신 실행할 수 있습니다. + +**11. 기술문서 추가** + +이제 *brand_new_bert*에 필요한 모든 기능이 추가되었습니다. 거의 끝났습니다! 추가해야 할 것은 멋진 기술문서과 기술문서 페이지입니다. Cookiecutter가 `docs/source/model_doc/brand_new_bert.md`라는 템플릿 파일을 추가해줬을 것입니다. 이 페이지를 사용하기 전에 모델을 사용하는 사용자들은 일반적으로 이 페이지를 먼저 확인합니다. 
따라서 문서는 이해하기 쉽고 간결해야 합니다. 모델을 사용하는 방법을 보여주기 위해 *팁*을 추가하는 것이 커뮤니티에 매우 유용합니다. 독스트링에 관련하여 Hugging Face 팀에 문의하는 것을 주저하지 마세요. + +다음으로, `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`에 추가된 독스트링이 올바르며 필요한 모든 입력 및 출력을 포함하도록 확인하세요. [여기](writing-documentation)에서 우리의 문서 작성 가이드와 독스트링 형식에 대한 상세 가이드가 있습니다. 문서는 일반적으로 커뮤니티와 모델의 첫 번째 접점이기 때문에, 문서는 적어도 코드만큼의 주의를 기울여야 합니다. + +**코드 리팩토링** + +좋아요, 이제 *brand_new_bert*를 위한 모든 필요한 코드를 추가했습니다. 이 시점에서 다음을 실행하여 잠재적으로 잘못된 코드 스타일을 수정해야 합니다: + +그리고 코딩 스타일이 품질 점검을 통과하는지 확인하기 위해 다음을 실행하고 확인해야 합니다: + +```bash +make style +``` + +🤗 Transformers에는 여전히 실패할 수 있는 몇 가지 매우 엄격한 디자인 테스트가 있습니다. 이는 독스트링에 누락된 정보나 잘못된 명명 때문에 종종 발생합니다. 여기서 막히면 Hugging Face 팀이 도움을 줄 것입니다. + +```bash +make quality +``` + +마지막으로, 코드가 정확히 작동하는 것을 확인한 후에는 항상 코드를 리팩토링하는 것이 좋은 생각입니다. 모든 테스트가 통과된 지금은 추가한 코드를 다시 검토하고 리팩토링하는 좋은 시기입니다. + +이제 코딩 부분을 완료했습니다. 축하합니다! 🎉 멋져요! 😎 + +**12. 모델을 모델 허브에 업로드하세요** + +이 마지막 파트에서는 모든 체크포인트를 변환하여 모델 허브에 업로드하고 각 업로드된 모델 체크포인트에 대한 모델 카드를 추가해야 합니다. [Model sharing and uploading Page](model_sharing)를 읽고 허브 기능에 익숙해지세요. *brand_new_bert*의 저자 조직 아래에 모델을 업로드할 수 있는 필요한 액세스 권한을 얻기 위해 Hugging Face 팀과 협업해야 합니다. `transformers`의 모든 모델에 있는 `push_to_hub` 메서드는 체크포인트를 허브에 빠르고 효율적으로 업로드하는 방법입니다. 아래에 작은 코드 조각이 붙여져 있습니다: + +각 체크포인트에 적합한 모델 카드를 만드는 데 시간을 할애하는 것은 가치가 있습니다. 모델 카드는 체크포인트의 특성을 강조해야 합니다. *예를 들어* 이 체크포인트는 어떤 데이터셋에서 사전 훈련/세부 훈련되었는지? 이 모델은 어떤 하위 작업에서 사용해야 하는지? 그리고 모델을 올바르게 사용하는 방법에 대한 몇 가지 코드도 포함해야 합니다. + +```python +brand_new_bert.push_to_hub("brand_new_bert") +# Uncomment the following line to push to an organization. +# brand_new_bert.push_to_hub("/brand_new_bert") +``` + +**13. (선택 사항) 노트북 추가** + +*brand_new_bert*를 다운스트림 작업에서 추론 또는 미세 조정에 사용하는 방법을 자세히 보여주는 노트북을 추가하는 것이 매우 유용합니다. 이것은 PR을 병합하는 데 필수적이지는 않지만 커뮤니티에 매우 유용합니다. + +**14. 완료된 PR 제출** + +이제 프로그래밍을 마쳤으며, 마지막 단계로 PR을 메인 브랜치에 병합해야 합니다. 보통 Hugging Face 팀은 이미 여기까지 도움을 주었을 것입니다. 그러나 PR에 멋진 설명을 추가하고 리뷰어에게 특정 디자인 선택 사항을 강조하려면 완료된 PR에 약간의 설명을 추가하는 시간을 할애하는 것이 가치가 있습니다. + +### 작업물을 공유하세요!! [[share-your-work]] + +이제 커뮤니티에서 작업물을 인정받을 시간입니다! 모델 추가 작업을 완료하는 것은 Transformers와 전체 NLP 커뮤니티에 큰 기여입니다. 당신의 코드와 이식된 사전 훈련된 모델은 수백, 심지어 수천 명의 개발자와 연구원에 의해 확실히 사용될 것입니다. 당신의 작업에 자랑스러워해야 하며 이를 커뮤니티와 공유해야 합니다. + +**당신은 커뮤니티 내 모든 사람들에게 매우 쉽게 접근 가능한 또 다른 모델을 만들었습니다! 🤯** diff --git a/docs/source/ko/add_new_pipeline.md b/docs/source/ko/add_new_pipeline.md new file mode 100644 index 000000000000..554300928b51 --- /dev/null +++ b/docs/source/ko/add_new_pipeline.md @@ -0,0 +1,248 @@ + + +# 어떻게 사용자 정의 파이프라인을 생성하나요? [[how-to-create-a-custom-pipeline]] + +이 가이드에서는 사용자 정의 파이프라인을 어떻게 생성하고 [허브](hf.co/models)에 공유하거나 🤗 Transformers 라이브러리에 추가하는 방법을 살펴보겠습니다. + +먼저 파이프라인이 수용할 수 있는 원시 입력을 결정해야 합니다. +문자열, 원시 바이트, 딕셔너리 또는 가장 원하는 입력일 가능성이 높은 것이면 무엇이든 가능합니다. +이 입력을 가능한 한 순수한 Python 형식으로 유지해야 (JSON을 통해 다른 언어와도) 호환성이 좋아집니다. +이것이 전처리(`preprocess`) 파이프라인의 입력(`inputs`)이 될 것입니다. + +그런 다음 `outputs`를 정의하세요. +`inputs`와 같은 정책을 따르고, 간단할수록 좋습니다. +이것이 후처리(`postprocess`) 메소드의 출력이 될 것입니다. + +먼저 4개의 메소드(`preprocess`, `_forward`, `postprocess` 및 `_sanitize_parameters`)를 구현하기 위해 기본 클래스 `Pipeline`을 상속하여 시작합니다. 
+ + +```python +from transformers import Pipeline + + +class MyPipeline(Pipeline): + def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "maybe_arg" in kwargs: + preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] + return preprocess_kwargs, {}, {} + + def preprocess(self, inputs, maybe_arg=2): + model_input = Tensor(inputs["input_ids"]) + return {"model_input": model_input} + + def _forward(self, model_inputs): + # model_inputs == {"model_input": model_input} + outputs = self.model(**model_inputs) + # Maybe {"logits": Tensor(...)} + return outputs + + def postprocess(self, model_outputs): + best_class = model_outputs["logits"].softmax(-1) + return best_class +``` + +이 분할 구조는 CPU/GPU에 대한 비교적 원활한 지원을 제공하는 동시에, 다른 스레드에서 CPU에 대한 사전/사후 처리를 수행할 수 있게 지원하는 것입니다. + +`preprocess`는 원래 정의된 입력을 가져와 모델에 공급할 수 있는 형식으로 변환합니다. +더 많은 정보를 포함할 수 있으며 일반적으로 `Dict` 형태입니다. + +`_forward`는 구현 세부 사항이며 직접 호출할 수 없습니다. +`forward`는 예상 장치에서 모든 것이 작동하는지 확인하기 위한 안전장치가 포함되어 있어 선호되는 호출 메소드입니다. +실제 모델과 관련된 것은 `_forward` 메소드에 속하며, 나머지는 전처리/후처리 과정에 있습니다. + +`postprocess` 메소드는 `_forward`의 출력을 가져와 이전에 결정한 최종 출력 형식으로 변환합니다. + +`_sanitize_parameters`는 초기화 시간에 `pipeline(...., maybe_arg=4)`이나 호출 시간에 `pipe = pipeline(...); output = pipe(...., maybe_arg=4)`과 같이, 사용자가 원하는 경우 언제든지 매개변수를 전달할 수 있도록 허용합니다. + +`_sanitize_parameters`의 반환 값은 `preprocess`, `_forward`, `postprocess`에 직접 전달되는 3개의 kwargs 딕셔너리입니다. +호출자가 추가 매개변수로 호출하지 않았다면 아무것도 채우지 마십시오. +이렇게 하면 항상 더 "자연스러운" 함수 정의의 기본 인수를 유지할 수 있습니다. + +분류 작업에서 `top_k` 매개변수가 대표적인 예입니다. + +```python +>>> pipe = pipeline("my-new-task") +>>> pipe("This is a test") +[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05} +{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}] + +>>> pipe("This is a test", top_k=2) +[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}] +``` + +이를 달성하기 위해 우리는 `postprocess` 메소드를 기본 매개변수인 `5`로 업데이트하고 `_sanitize_parameters`를 수정하여 이 새 매개변수를 허용합니다. + + +```python +def postprocess(self, model_outputs, top_k=5): + best_class = model_outputs["logits"].softmax(-1) + # top_k를 처리하는 로직 추가 + return best_class + + +def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "maybe_arg" in kwargs: + preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] + + postprocess_kwargs = {} + if "top_k" in kwargs: + postprocess_kwargs["top_k"] = kwargs["top_k"] + return preprocess_kwargs, {}, postprocess_kwargs +``` + +입/출력을 가능한 한 간단하고 완전히 JSON 직렬화 가능한 형식으로 유지하려고 노력하십시오. +이렇게 하면 사용자가 새로운 종류의 개체를 이해하지 않고도 파이프라인을 쉽게 사용할 수 있습니다. +또한 사용 용이성을 위해 여러 가지 유형의 인수(오디오 파일은 파일 이름, URL 또는 순수한 바이트일 수 있음)를 지원하는 것이 비교적 일반적입니다. + + + +## 지원되는 작업 목록에 추가하기 [[adding-it-to-the-list-of-supported-tasks]] + +`new-task`를 지원되는 작업 목록에 등록하려면 `PIPELINE_REGISTRY`에 추가해야 합니다: + +```python +from transformers.pipelines import PIPELINE_REGISTRY + +PIPELINE_REGISTRY.register_pipeline( + "new-task", + pipeline_class=MyPipeline, + pt_model=AutoModelForSequenceClassification, +) +``` + +원하는 경우 기본 모델을 지정할 수 있으며, 이 경우 특정 개정(분기 이름 또는 커밋 해시일 수 있음, 여기서는 "abcdef")과 타입을 함께 가져와야 합니다: + +```python +PIPELINE_REGISTRY.register_pipeline( + "new-task", + pipeline_class=MyPipeline, + pt_model=AutoModelForSequenceClassification, + default={"pt": ("user/awesome_model", "abcdef")}, + type="text", # 현재 지원 유형: text, audio, image, multimodal +) +``` + +## Hub에 파이프라인 공유하기 [[share-your-pipeline-on-the-hub]] + +Hub에 사용자 정의 파이프라인을 공유하려면 `Pipeline` 하위 클래스의 사용자 정의 코드를 Python 파일에 저장하기만 하면 됩니다. 
+예를 들어, 다음과 같이 문장 쌍 분류를 위한 사용자 정의 파이프라인을 사용한다고 가정해 보겠습니다: + +```py +import numpy as np + +from transformers import Pipeline + + +def softmax(outputs): + maxes = np.max(outputs, axis=-1, keepdims=True) + shifted_exp = np.exp(outputs - maxes) + return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) + + +class PairClassificationPipeline(Pipeline): + def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "second_text" in kwargs: + preprocess_kwargs["second_text"] = kwargs["second_text"] + return preprocess_kwargs, {}, {} + + def preprocess(self, text, second_text=None): + return self.tokenizer(text, text_pair=second_text, return_tensors=self.framework) + + def _forward(self, model_inputs): + return self.model(**model_inputs) + + def postprocess(self, model_outputs): + logits = model_outputs.logits[0].numpy() + probabilities = softmax(logits) + + best_class = np.argmax(probabilities) + label = self.model.config.id2label[best_class] + score = probabilities[best_class].item() + logits = logits.tolist() + return {"label": label, "score": score, "logits": logits} +``` + +구현은 프레임워크에 구애받지 않으며, PyTorch와 TensorFlow 모델에 대해 작동합니다. +이를 `pair_classification.py`라는 파일에 저장한 경우, 다음과 같이 가져오고 등록할 수 있습니다: + +```py +from pair_classification import PairClassificationPipeline +from transformers.pipelines import PIPELINE_REGISTRY +from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification + +PIPELINE_REGISTRY.register_pipeline( + "pair-classification", + pipeline_class=PairClassificationPipeline, + pt_model=AutoModelForSequenceClassification, + tf_model=TFAutoModelForSequenceClassification, +) +``` + +이 작업이 완료되면 사전훈련된 모델과 함께 사용할 수 있습니다. +예를 들어, `sgugger/finetuned-bert-mrpc`은 MRPC 데이터 세트에서 미세 조정되어 문장 쌍을 패러프레이즈인지 아닌지를 분류합니다. + +```py +from transformers import pipeline + +classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc") +``` + +그런 다음 `Repository`의 `save_pretrained` 메소드를 사용하여 허브에 공유할 수 있습니다: + +```py +from huggingface_hub import Repository + +repo = Repository("test-dynamic-pipeline", clone_from="{your_username}/test-dynamic-pipeline") +classifier.save_pretrained("test-dynamic-pipeline") +repo.push_to_hub() +``` + +이렇게 하면 "test-dynamic-pipeline" 폴더 내에 `PairClassificationPipeline`을 정의한 파일이 복사되며, 파이프라인의 모델과 토크나이저도 저장한 후, `{your_username}/test-dynamic-pipeline` 저장소에 있는 모든 것을 푸시합니다. +이후에는 `trust_remote_code=True` 옵션만 제공하면 누구나 사용할 수 있습니다. + +```py +from transformers import pipeline + +classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True) +``` + +## 🤗 Transformers에 파이프라인 추가하기 [[add-the-pipeline-to-transformers]] + +🤗 Transformers에 사용자 정의 파이프라인을 기여하려면, `pipelines` 하위 모듈에 사용자 정의 파이프라인 코드와 함께 새 모듈을 추가한 다음, `pipelines/__init__.py`에서 정의된 작업 목록에 추가해야 합니다. + +그런 다음 테스트를 추가해야 합니다. +`tests/test_pipelines_MY_PIPELINE.py`라는 새 파일을 만들고 다른 테스트와 예제를 함께 작성합니다. + +`run_pipeline_test` 함수는 매우 일반적이며, `model_mapping` 및 `tf_model_mapping`에서 정의된 가능한 모든 아키텍처의 작은 무작위 모델에서 실행됩니다. + +이는 향후 호환성을 테스트하는 데 매우 중요하며, 누군가 `XXXForQuestionAnswering`을 위한 새 모델을 추가하면 파이프라인 테스트가 해당 모델에서 실행을 시도한다는 의미입니다. +모델이 무작위이기 때문에 실제 값을 확인하는 것은 불가능하므로, 단순히 파이프라인 출력 `TYPE`과 일치시키기 위한 도우미 `ANY`가 있습니다. + +또한 2개(이상적으로는 4개)의 테스트를 구현해야 합니다. + +- `test_small_model_pt`: 이 파이프라인에 대한 작은 모델 1개를 정의(결과가 의미 없어도 상관없음)하고 파이프라인 출력을 테스트합니다. +결과는 `test_small_model_tf`와 동일해야 합니다. +- `test_small_model_tf`: 이 파이프라인에 대한 작은 모델 1개를 정의(결과가 의미 없어도 상관없음)하고 파이프라인 출력을 테스트합니다. +결과는 `test_small_model_pt`와 동일해야 합니다. 
+- `test_large_model_pt`(`선택사항`): 결과가 의미 있을 것으로 예상되는 실제 파이프라인에서 파이프라인을 테스트합니다. +이러한 테스트는 속도가 느리므로 이를 표시해야 합니다. +여기서의 목표는 파이프라인을 보여주고 향후 릴리즈에서의 변화가 없는지 확인하는 것입니다. +- `test_large_model_tf`(`선택사항`): 결과가 의미 있을 것으로 예상되는 실제 파이프라인에서 파이프라인을 테스트합니다. +이러한 테스트는 속도가 느리므로 이를 표시해야 합니다. +여기서의 목표는 파이프라인을 보여주고 향후 릴리즈에서의 변화가 없는지 확인하는 것입니다. diff --git a/docs/source/ko/add_tensorflow_model.md b/docs/source/ko/add_tensorflow_model.md new file mode 100644 index 000000000000..378f2163b5db --- /dev/null +++ b/docs/source/ko/add_tensorflow_model.md @@ -0,0 +1,262 @@ + + +# 어떻게 🤗 Transformers 모델을 TensorFlow로 변환하나요? [[how-to-convert-a-transformers-model-to-tensorflow]] + +🤗 Transformers에서처럼 사용할 수 있는 여러 가지 프레임워크가 있다는 것은 애플리케이션을 설계할 때 그들의 강점을 유연하게 이용할 수 있다는 장점이 있지만, 모델 별로 호환성을 추가해야 한다는 단점 또한 존재한다는 것을 의미합니다. 좋은 소식은 기존 모델에 TensorFlow 호환성을 추가하는 것이 [처음부터 새로운 모델을 추가하는 것](add_new_model)보다도 간단하다는 것입니다! + +만약 대규모 TensorFlow 모델을 더 깊이 이해하려거나, 오픈 소스에 큰 기여를 하려거나, 선택한 모델에 Tensorflow를 활용하려한다면, 이 안내서는 여러분께 도움이 될 것입니다. + +이 가이드는 Hugging Face 팀의 최소한의 감독 아래에서 🤗 Transformers에서 사용되는 TensorFlow 모델 가중치와/또는 아키텍처를 기여할 수 있는 커뮤니티 구성원인 여러분을 대상으로 합니다. +새로운 모델을 작성하는 것은 쉬운 일이 아니지만, 이 가이드를 통해 조금 덜 힘들고 훨씬 쉬운 작업으로 만들 수 있습니다. +모두의 경험을 모으는 것은 이 작업을 점차적으로 더 쉽게 만드는 데 굉장히 중요하기 때문에, 이 가이드를 개선시킬만한 제안이 떠오르면 공유하시는걸 적극적으로 권장합니다! + +더 깊이 알아보기 전에, 🤗 Transformers를 처음 접하는 경우 다음 자료를 확인하는 것이 좋습니다: +- [🤗 Transformers의 일반 개요](add_new_model#general-overview-of-transformers) +- [Hugging Face의 TensorFlow 철학](https://huggingface.co/blog/tensorflow-philosophy) + +이 가이드의 나머지 부분에서는 새로운 TensorFlow 모델 아키텍처를 추가하는 데 필요한 단계, Pytorch를 TensorFlow 모델 가중치로 변환하는 절차 및 ML 프레임워크 간의 불일치를 효율적으로 디버깅하는 방법을 알게 될 것입니다. 시작해봅시다! + + + +사용하려는 모델이 이미 해당하는 TensorFlow 아키텍처가 있는지 확실하지 않나요? + +선택한 모델([예](https://huggingface.co/bert-base-uncased/blob/main/config.json#L14))의 `config.json`의 `model_type` 필드를 확인해보세요. 🤗 Transformers의 해당 모델 폴더에는 "modeling_tf"로 시작하는 파일이 있는 경우, 해당 모델에는 해당 TensorFlow 아키텍처([예](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert))가 있다는 의미입니다. + + + +## TensorFlow 모델 아키텍처 코드 추가하는 단계별 가이드 [[step-by-step-guide-to add-tensorFlow-model-architecture-code]] + +대규모 아키텍처를 가진 모델을 설계하는 방법에는 여러가지가 있으며, 해당 설계를 구현하는 방법도 여러 가지입니다. +그러나 우리는 [🤗 Transformers 일반 개요](add_new_model#general-overview-of-transformers)에서 언급한 대로 일관된 설계 선택에 따라야지만 🤗 Transformers를 사용하기 편할 것이라는 확고한 의견을 가지고 있습니다. +우리의 경험을 통해 TensorFlow 모델을 추가하는 데 관련된 중요한 몇 가지 사항을 알려 드릴 수 있습니다: + +- 이미 있는걸 다시 개발하려 하지 마세요! 최소한 2개의 이미 구현된 모델을 대개 참조해야 합니다. 구현하려는 모델과 기능상 동일한 Pytorch 모델 하나와 같은 문제 유형을 풀고 있는 다른 TensorFlow 모델 하나를 살펴보세요. +- 우수한 모델 구현은 시간이 지나도 남아있습니다. 이것은 코드가 아름답다는 이유가 아니라 코드가 명확하고 디버깅 및 개선이 쉽기 때문입니다. TensorFlow 구현에서 다른 모델들과 패턴을 똑같이 하고 Pytorch 구현과의 불일치를 최소화하여 메인테이너의 업무를 쉽게 한다면, 기여한 코드가 오래도록 유지될 수 있습니다. +- 필요하다면 도움을 요청하세요! 🤗 Transformers 팀은 여러분을 돕기 위해 있으며, 여러분이 직면한 동일한 문제에 대한 해결책을 이미 찾은 경우도 있을 수 있습니다. + +TensorFlow 모델 아키텍처를 추가하는 데 필요한 단계를 개략적으로 써보면: +1. 변환하려는 모델 선택 +2. transformers 개발 환경 준비 +3. (선택 사항) 이론적 측면 및 기존 구현 이해 +4. 모델 아키텍처 구현 +5. 모델 테스트 구현 +6. PR (pull request) 제출 +7. (선택 사항) 데모 빌드 및 공유 + +### 1.-3. 모델 기여 준비 [[1.-3.-prepare-your-model-contribution]] + +**1. 변환하려는 모델 선택** + +우선 기본 사항부터 시작해 보겠습니다. 먼저 변환하려는 아키텍처를 알아야 합니다. +특정 아키텍처에 대한 관심 없는 경우, 🤗 Transformers 팀에게 제안을 요청하는 것은 여러분의 영향력을 극대화하는 좋은 방법입니다. +우리는 TensorFlow에서 빠져 있는 가장 유명한 아키텍처로 이끌어 드리겠습니다. +TensorFlow에서 사용할 모델이 이미 🤗 Transformers에 TensorFlow 아키텍처 구현이 있지만 가중치가 없는 경우, +이 페이지의 [가중치 추가 섹션](#adding-tensorflow-weights-to-hub)으로 바로 이동하셔도 됩니다. + +간단히 말해서, 이 안내서의 나머지 부분은 TensorFlow 버전의 *BrandNewBert*([가이드](add_new_model)와 동일한 예제)를 기여하려고 결정했다고 가정합니다. 
+ + + +TensorFlow 모델 아키텍처에 작업을 시작하기 전에 해당 작업이 진행 중인지 확인하세요. +`BrandNewBert`를 검색하여 +[pull request GitHub 페이지](https://github.com/huggingface/transformers/pulls?q=is%3Apr)에서 TensorFlow 관련 pull request가 없는지 확인할 수 있습니다. + + + +**2. transformers 개발 환경 준비** + + +모델 아키텍처를 선택한 후, 관련 작업을 수행할 의도를 미리 알리기 위해 Draft PR을 여세요. 아래 지침대로 하시면 환경을 설정하고 Draft PR을 열 수 있습니다. + +1. 'Fork' 버튼을 클릭하여 [리포지터리](https://github.com/huggingface/transformers)를 포크하세요. 이렇게 하면 GitHub 사용자 계정에 코드의 사본이 생성됩니다. + + +2. `transformers` 포크를 로컬 디스크에 클론하고 원본 리포지터리를 원격 리포지터리로 추가하세요. + +```bash +git clone https://github.com/[your Github handle]/transformers.git +cd transformers +git remote add upstream https://github.com/huggingface/transformers.git +``` + +3. 개발 환경을 설정하세요. 예를 들어, 다음 명령을 실행하여 개발 환경을 설정할 수 있습니다. + +```bash +python -m venv .env +source .env/bin/activate +pip install -e ".[dev]" +``` + +운영 체제에 따라서 Transformers의 선택적 종속성이 증가하면서 위 명령이 실패할 수도 있습니다. 그런 경우 TensorFlow를 설치한 후 다음을 실행하세요. + +```bash +pip install -e ".[quality]" +``` + +**참고:** CUDA를 설치할 필요는 없습니다. 새로운 모델이 CPU에서 작동하도록 만드는 것만으로 충분합니다. + +4. 메인 브랜치에서 만드려는 기능이 잘 표현되는 이름으로 브랜치를 만듭니다. + +```bash +git checkout -b add_tf_brand_new_bert +``` + +5. 메인 브랜치의 현재 상태를 페치(fetch)하고 리베이스하세요. + +```bash +git fetch upstream +git rebase upstream/main +``` + +6. `transformers/src/models/brandnewbert/`에 `modeling_tf_brandnewbert.py`라는 빈 `.py` 파일을 추가하세요. 이 파일이 TensorFlow 모델 파일이 될 것입니다. + +7. 변경 사항을 계정에 푸시하세요. + +```bash +git add . +git commit -m "initial commit" +git push -u origin add_tf_brand_new_bert +``` + +8. 만족스러운 경우 GitHub에서 포크된 웹 페이지로 이동합니다. "Pull request"를 클릭합니다. Hugging Face 팀의 GitHub ID를 리뷰어로 추가해서, 앞으로의 변경 사항에 대해 Hugging Face 팀이 알림을 받을 수 있도록 합니다. + + +9. GitHub Pull Requests 페이지의 오른쪽에 있는 "Convert to draft"를 클릭하여 PR을 초안으로 변경하세요. + +이제 🤗 Transformers에서 *BrandNewBert*를 TensorFlow로 변환할 개발 환경을 설정했습니다. + + +**3. (선택 사항) 이론적 측면 및 기존 구현 이해** + + +*BrandNewBert*처럼 자세한 글이 있다면 시간을 내어 논문을 읽는걸 추천드립니다. 이해하기 어려운 부분이 많을 수 있습니다. 그렇다고 해서 걱정하지 마세요! 목표는 논문의 심도있는 이론적 이해가 아니라 TensorFlow를 사용하여 🤗 Transformers에 모델을 효과적으로 다시 구현하는 데 필요한 필수 정보를 추출하는 것입니다. 많은 시간을 이론적 이해에 투자할 필요는 없지만 실용적인 측면에서 현재 존재하는 모델 문서 페이지(e.g. [model docs for BERT](model_doc/bert))에 집중하는 것이 좋습니다. + + +모델의 기본 사항을 이해한 후, 기존 구현을 이해하는 것이 중요합니다. 이는 작업 중인 모델에 대한 실제 구현이 여러분의 기대와 일치함을 확인하고, TensorFlow 측면에서의 기술적 문제를 예상할 수 있습니다. + +막대한 양의 정보를 처음으로 학습할 때 압도당하는 것은 자연스러운 일입니다. 이 단계에서 모델의 모든 측면을 이해해야 하는 필요는 전혀 없습니다. 그러나 우리는 Hugging Face의 [포럼](https://discuss.huggingface.co/)을 통해 질문이 있는 경우 대답을 구할 것을 권장합니다. + +### 4. 모델 구현 [[4-model-implementation]] + + +이제 드디어 코딩을 시작할 시간입니다. 우리의 제안된 시작점은 PyTorch 파일 자체입니다: `modeling_brand_new_bert.py`의 내용을 +`src/transformers/models/brand_new_bert/` 내부의 +`modeling_tf_brand_new_bert.py`에 복사합니다. 이 섹션의 목표는 파일을 수정하고 🤗 Transformers의 import 구조를 업데이트하여 `TFBrandNewBert` 및 `TFBrandNewBert.from_pretrained(model_repo, from_pt=True)`가 성공적으로 작동하는 TensorFlow *BrandNewBert* 모델을 가져올 수 있도록 하는 것입니다. + +유감스럽게도, PyTorch 모델을 TensorFlow로 변환하는 규칙은 없습니다. 그러나 프로세스를 가능한한 원활하게 만들기 위해 다음 팁을 따를 수 있습니다. + +- 모든 클래스 이름 앞에 `TF`를 붙입니다(예: `BrandNewBert`는 `TFBrandNewBert`가 됩니다). +- 대부분의 PyTorch 작업에는 직접적인 TensorFlow 대체가 있습니다. 예를 들어, `torch.nn.Linear`는 `tf.keras.layers.Dense`에 해당하고, `torch.nn.Dropout`은 `tf.keras.layers.Dropout`에 해당합니다. 특정 작업에 대해 확신이 없는 경우 [TensorFlow 문서](https://www.tensorflow.org/api_docs/python/tf)나 [PyTorch 문서](https://pytorch.org/docs/stable/)를 참조할 수 있습니다. +- 🤗 Transformers 코드베이스에서 패턴을 찾으세요. 직접적인 대체가 없는 특정 작업을 만나면 다른 사람이 이미 동일한 문제를 해결한 경우가 많습니다. +- 기본적으로 PyTorch와 동일한 변수 이름과 구조를 유지하세요. 이렇게 하면 디버깅과 문제 추적, 그리고 문제 해결 추가가 더 쉬워집니다. 
+- 일부 레이어는 각 프레임워크마다 다른 기본값을 가지고 있습니다. 대표적인 예로 배치 정규화 레이어의 epsilon은 [PyTorch](https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm2d.html#torch.nn.BatchNorm2d)에서 `1e-5`이고 [TensorFlow](https://www.tensorflow.org/api_docs/python/tf/keras/layers/BatchNormalization)에서 `1e-3`입니다. 문서를 모두 확인하세요! +- PyTorch의 `nn.Parameter` 변수는 일반적으로 TF 레이어의 `build()` 내에서 초기화해야 합니다. 다음 예를 참조하세요: [PyTorch](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_vit_mae.py#L212) / + [TensorFlow](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_tf_vit_mae.py#L220) +- PyTorch 모델의 함수 상단에 `#copied from ...`가 있는 경우, TensorFlow 모델에 TensorFlow 아키텍처가 있다면 TensorFlow 모델이 해당 함수를 복사한 아키텍처에서 사용할 수 있습니다. +- TensorFlow 함수에서 `name` 속성을 올바르게 할당하는 것은 `from_pt=True` 가중치 교차 로딩을 수행하는 데 중요합니다. `name`은 대부분 PyTorch 코드의 해당 변수의 이름입니다. `name`이 제대로 설정되지 않으면 모델 가중치를 로드할 때 오류 메시지에서 확인할 수 있습니다. +- 기본 모델 클래스인 `BrandNewBertModel`의 로직은 실제로 Keras 레이어 서브클래스([예시](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L719))인 `TFBrandNewBertMainLayer`에 있습니다. `TFBrandNewBertModel`은 이 레이어를 감싸기만 하는 래퍼 역할을 합니다. +- Keras 모델은 사전 훈련된 가중치를 로드하기 위해 빌드되어야 합니다. 따라서 `TFBrandNewBertPreTrainedModel`은 모델의 입력 예제인 `dummy_inputs`([예시](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L916)) 유지해야 합니다. +- 도움이 필요한 경우 도움을 요청하세요. 우리는 여기 있어서 도움을 드리기 위해 있는 것입니다! 🤗 + +모델 파일 자체 외에도 모델 클래스 및 관련 문서 페이지에 대한 포인터를 추가해야 합니다. 이 부분은 다른 PR([예시](https://github.com/huggingface/transformers/pull/18020/files))의 패턴을 따라 완전히 완료할 수 있습니다. 다음은 필요한 수동 변경 목록입니다. + +- `src/transformers/__init__.py`에 *BrandNewBert*의 모든 공개 클래스를 포함합니다. +- `src/transformers/models/auto/modeling_tf_auto.py`에서 *BrandNewBert* 클래스를 해당 Auto 클래스에 추가합니다. +- `src/transformers/utils/dummy_tf_objects.py`에 *BrandNewBert*와 관련된 레이지 로딩 클래스를 추가합니다. +- `src/transformers/models/brand_new_bert/__init__.py`에서 공개 클래스에 대한 import 구조를 업데이트합니다. +- `docs/source/en/model_doc/brand_new_bert.md`에서 *BrandNewBert*의 공개 메서드에 대한 문서 포인터를 추가합니다. +- `docs/source/en/model_doc/brand_new_bert.md`의 *BrandNewBert* 기여자 목록에 자신을 추가합니다. +- 마지막으로 ✅ 녹색 체크박스를 TensorFlow 열 docs/source/en/index.md 안 BrandNewBert에 추가합니다. + +구현이 만족하면 다음 체크리스트를 실행하여 모델 아키텍처가 준비되었는지 확인하세요. + +1. 훈련 시간에 다르게 동작하는 `training` 인수로 불리는 모든 레이어(예: Dropout)는 최상위 클래스에서 전파됩니다. +2. #copied from ...가능할 때마다 사용했습니다. +3. `TFBrandNewBertMainLayer`와 그것을 사용하는 모든 클래스는 `call`함수로 `@unpack_inputs`와 함께 데코레이터 됩니다. +4. `TFBrandNewBertMainLayer`는 `@keras_serializable`로 데코레이터 됩니다. +5. TensorFlow 모델은 `TFBrandNewBert.from_pretrained(model_repo, from_pt=True)`를 사용하여 PyTorch 가중치에서 로드할 수 있습니다. +6. 예상 입력 형식을 사용하여 TensorFlow 모델을 호출할 수 있습니다. + +### 5. 모델 테스트 구현 [[5-add-model-tests]] + +TensorFlow 모델 아키텍처를 구현하는 데 성공했습니다! 이제 TensorFlow 모델을 테스트하는 구현을 작성할 차례입니다. 이를 통해 모델이 예상대로 작동하는지 확인할 수 있습니다. 이전에 우리는 `test_modeling_brand_new_bert.py` 파일을 `tests/models/brand_new_bert/ into test_modeling_tf_brand_new_bert.py`에 복사한 뒤, TensorFlow로 교체하는 것이 좋습니다. 지금은, 모든 `.from_pretrained()`을 `from_pt=True`를 사용하여 존재하는 Pytorch 가중치를 가져오도록 해야합니다. + +완료하셨으면, 이제 진실의 순간이 찾아왔습니다: 테스트를 실행해 보세요! 😬 + +```bash +NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \ +py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py +``` + +오류가 많이 나타날 것이지만 괜찮습니다! 
기계 학습 모델을 디버깅하는 것은 악명높게 어려우며 성공의 핵심 요소는 인내심입니다 (`breakpoint()`도 필요합니다). 우리의 경험상으로는 ML 프레임워크 사이의 미묘한 불일치로 인해 가장 어려운 문제가 발생합니다. 이에 대한 몇 가지 지침이 이 가이드의 끝 부분에 있습니다. 다른 경우에는 일반 테스트가 직접 모델에 적용되지 않을 수 있으며, 이 경우 모델 테스트 클래스 레벨에서 재정의를 제안합니다. 문제가 무엇이든지 상관없이 문제가 있으면 당신이 고립되었다면 draft pull request에서 도움을 요청하는 것이 좋습니다. + +모든 테스트가 통과되면 축하합니다. 이제 모델을 🤗 Transformers 라이브러리에 추가할 준비가 거의 완료된 것입니다! 🎉 + + +테스트를 추가하는 방법에 대한 자세한 내용은 [🤗 Transformers의 테스트 가이드](https://huggingface.co/transformers/contributing.html#running-tests)를 참조하세요. + +### 6.-7. 모든 사용자가 당신의 모델을 사용할 수 있게 하기 [[6.-7.-ensure-everyone -can-use-your-model]] + +**6. 풀 요청 제출하기** + +구현과 테스트가 완료되면 풀 요청을 제출할 시간입니다. 코드를 푸시하기 전에 코드 서식 맞추기 유틸리티인 `make fixup` 🪄 를 실행하세요. 이렇게 하면 자동으로 서식 오류를 수정하며 자동 검사가 실패하는 것을 방지할 수 있습니다. + +이제 드래프트 풀 요청을 실제 풀 요청으로 변환하는 시간입니다. "리뷰 준비됨" 버튼을 클릭하고 Joao (`@gante`)와 Matt (`@Rocketknight1`)를 리뷰어로 추가하세요. 모델 풀 요청에는 적어도 3명의 리뷰어가 필요하지만, 그들이 당신의 모델에 적절한 추가 리뷰어를 찾을 것입니다. + +모든 리뷰어들이 PR 상태에 만족하면 마지막으로 `.from_pretrained()` 호출에서 `from_pt=True` 플래그를 제거하는 것입니다. TensorFlow 가중치가 없기 때문에 이를 추가해야 합니다! 이를 수행하는 방법은 아래 섹션의 지침을 확인하세요. + +마침내 TensorFlow 가중치가 병합되고, 적어도 3명의 리뷰어 승인을 받았으며 모든 CI 검사가 통과되었다면, 로컬로 테스트를 한 번 더 확인하세요. + +```bash +NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \ +py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py +``` + +그리고 우리는 당신의 PR을 병합할 것입니다! 마일스톤 달성을 축하드립니다! 🎉 + +**7. (선택 사항) 데모를 만들고 세상과 공유하기** + +오픈 소스의 가장 어려운 부분 중 하나는 발견입니다. 다른 사용자들이 당신의 멋진 TensorFlow 기여를 어떻게 알 수 있을까요? 물론 적절한 커뮤니케이션으로 가능합니다! 📣 + +커뮤니티와 모델을 공유하는 두 가지 주요 방법이 있습니다: +- 데모 만들기. Gradio 데모, 노트북 및 모델을 자랑하는 다른 재미있는 방법을 포함합니다. [커뮤니티 기반 데모](https://huggingface.co/docs/transformers/community)에 노트북을 추가하는 것을 적극 권장합니다. +- Twitter와 LinkedIn과 같은 소셜 미디어에 이야기 공유하기. 당신의 작업에 자랑스러워하고 커뮤니티와 당신의 업적을 공유해야 합니다. 이제 당신의 모델은 전 세계의 수천 명의 엔지니어와 연구원들에 의해 사용될 수 있습니다 🌍! 우리는 당신의 게시물을 리트윗하고 커뮤니티와 함께 당신의 작업을 공유하는 데 도움이 될 것입니다. + + +## 🤗 허브에 TensorFlow 가중치 추가하기 [[adding-tensorFlow-weights-to-🤗-hub]] + +TensorFlow 모델 아키텍처가 🤗 Transformers에서 사용 가능하다고 가정하고, PyTorch 가중치를 TensorFlow 가중치로 변환하는 것은 쉽습니다! + +다음은 그 방법입니다: +1. 터미널에서 Hugging Face 계정으로 로그인되어 있는지 확인하십시오. `huggingface-cli login` 명령어를 사용하여 로그인할 수 있습니다. (액세스 토큰은 [여기](https://huggingface.co/settings/tokens)에서 찾을 수 있습니다.) +2. `transformers-cli pt-to-tf --model-name foo/bar`를 실행하십시오. 여기서 `foo/bar`는 변환하려는 PyTorch 가중치가 있는 모델 저장소의 이름입니다. +3. 방금 만든 🤗 허브 PR에서 `@joaogante`와 `@Rocketknight1`을 태그합니다. + +그게 다입니다! 🎉 + + +## ML 프레임워크 간 디버깅 🐛[[debugging-mismatches-across-ml-frameworks]] + +새로운 아키텍처를 추가하거나 기존 아키텍처에 대한 TensorFlow 가중치를 생성할 때, PyTorch와 TensorFlow 간의 불일치로 인한 오류가 발생할 수 있습니다. 심지어 두 프레임워크의 모델 아키텍처 코드가 동일해 보일 수도 있습니다. 무슨 일이 벌어지고 있는 걸까요? 🤔 + +먼저, 이러한 불일치를 이해하는 이유에 대해 이야기해 보겠습니다. 많은 커뮤니티 멤버들은 🤗 Transformers 모델을 그대로 사용하고, 우리의 모델이 예상대로 작동할 것이라고 믿습니다. 두 프레임워크 간에 큰 불일치가 있으면 모델이 적어도 하나의 프레임워크에 대한 참조 구현을 따르지 않음을 의미합니다. 이는 모델이 의도한 대로 작동하지 않을 수 있음을 나타냅니다. 이는 아예 실행되지 않는 모델보다 나쁠 수 있습니다! 따라서 우리는 모든 모델의 프레임워크 불일치를 `1e-5`보다 작게 유지하는 것을 목표로 합니다. + +기타 숫자 문제와 마찬가지로, 세세한 문제가 있습니다. 그리고 세세함에 집중하는 공정에서 필수 요소는 인내심입니다. 이러한 종류의 문제가 발생할 때 권장되는 작업 흐름은 다음과 같습니다: +1. 불일치의 원인을 찾아보십시오. 변환 중인 모델은 아마도 특정 지점까지 거의 동일한 내부 변수를 가지고 있을 것입니다. 두 프레임워크의 아키텍처에 `breakpoint()` 문을 넣고, 위에서 아래로 숫자 변수의 값을 비교하여 문제의 근원을 찾아냅니다. +2. 이제 문제의 근원을 찾았으므로 🤗 Transformers 팀에 연락하세요. 우리는 비슷한 문제를 이전에 겪었을 수 있으며 빠르게 해결책을 제공할 수 있습니다. 예외적인 경우에는 StackOverflow와 GitHub 이슈와 같은 인기있는 페이지를 확인하십시오. +3. 더 이상 해결책이 없는 경우, 더 깊이 들어가야 합니다. 좋은 소식은 문제의 원인을 찾았으므로 나머지 모델을 추상화하고 문제가 있는 명령어에 초점을 맞출 수 있습니다! 나쁜 소식은 해당 명령어의 소스 구현에 대해 알아봐야 한다는 것입니다. 일부 경우에는 참조 구현에 문제가 있을 수도 있으니 업스트림 저장소에서 이슈를 열기를 꺼리지 마십시오. 
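As an illustration of the top-to-bottom comparison described above, the following sketch loads the same checkpoint in PyTorch and TensorFlow and prints the maximum absolute difference of every hidden state. It uses `bert-base-uncased` purely as a stand-in — for a new architecture you would instantiate your own `BrandNewBert`/`TFBrandNewBert` classes instead — and the tolerance that matters depends on your model.

```python
import numpy as np
from transformers import AutoTokenizer, AutoModel, TFAutoModel

checkpoint = "bert-base-uncased"  # stand-in checkpoint for the example
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
pt_model = AutoModel.from_pretrained(checkpoint)
tf_model = TFAutoModel.from_pretrained(checkpoint, from_pt=True)

text = "Debugging cross-framework mismatches"
pt_inputs = tokenizer(text, return_tensors="pt")
tf_inputs = tokenizer(text, return_tensors="tf")

pt_hidden = pt_model(**pt_inputs, output_hidden_states=True).hidden_states
tf_hidden = tf_model(**tf_inputs, output_hidden_states=True).hidden_states

# Compare layer by layer, top to bottom, to locate where the two frameworks diverge
for i, (pt_h, tf_h) in enumerate(zip(pt_hidden, tf_hidden)):
    max_diff = np.abs(pt_h.detach().numpy() - tf_h.numpy()).max()
    print(f"hidden state {i}: max abs diff = {max_diff:.2e}")
```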
+ +어떤 경우에는 🤗 Transformers 팀과의 토론을 통해 불일치를 수정할 수 없을 수도 있습니다. 모델의 출력 레이어에서 불일치가 매우 작지만 숨겨진 상태에서 크게 나타날 수 있기 때문입니다. 이 경우 모델을 배포하는 것을 우선시하기 위해 불일치를 무시하기로 결정할 수도 있습니다. 위에서 언급한 `pt-to-tf` CLI에는 가중치 변환 시 오류 메시지를 무시하는 `--max-error` 플래그가 있습니다. diff --git a/docs/source/ko/attention.md b/docs/source/ko/attention.md new file mode 100644 index 000000000000..8f82a4b851e4 --- /dev/null +++ b/docs/source/ko/attention.md @@ -0,0 +1,54 @@ + + +# 어텐션 메커니즘[[attention_mechanisms]] + +대부분의 트랜스포머 모델은 정방행렬인 전체 어텐션을 사용합니다. +하지만 이는 긴 텍스트를 다룰 때는 큰 계산 병목 현상을 유발할 수 있습니다. +`Longformer`와 `Reformer`는 훈련 속도를 높이기 위해 어텐션 행렬의 희소 버전을 사용하여 효율을 높이려는 모델입니다. + +## LSH 어텐션[[lsh_attention]] + + +[Reformer](#reformer)는 LSH(Locality Sensitive Hashing) 어텐션을 사용합니다. softmax(QK^t)에서는 행렬 QK^t의 (softmax 차원에서) 가장 큰 요소들만 유용한 기여를 할 것입니다. +따라서 각각의 쿼리 q에 대해, q와 가까운 키 k만 고려할 수 있습니다. 해시 함수는 q와 k가 가까운지 여부를 결정하는 데 사용됩니다. +어텐션 마스크는 현재 토큰을 마스킹하여 변경됩니다. 이 때 첫 번째 위치의 토큰은 제외합니다. 왜냐하면 쿼리와 키가 동일한 값을 갖게 되기 때문입니다(서로 매우 유사함). +해시는 약간의 무작위성을 가질 수 있으므로, 실제로는 여러 개의 해시 함수가 사용되고 (`n_rounds` 매개변수에 의해 결정됨) 그 후에 평균값을 취하게 됩니다. + +## 지역 어텐션[[local_attention]] + +[Longformer](#longformer)는 지역 어텐션을 사용합니다. 종종 특정 토큰에 대해 지역 컨텍스트(예: 왼쪽과 오른쪽에 있는 두 개의 토큰은 무엇인가요?)만으로도 작업을 수행하는데 충분합니다. +또한 작은 창(window)을 가진 어텐션 레이어를 쌓음으로써 마지막 레이어는 창 내의 토큰뿐만 아니라 더 많은 수의 토큰에 대한 수용 영역(receptive field)을 갖게 되어 전체 문장의 표현을 구축할 수 있습니다. + +사전에 선택된 일부 입력 토큰들은 전역 어텐션을 받습니다. 이 몇 개의 토큰에 대해서는 어텐션 행렬이 모든 토큰에 접근할 수 있으며, 이 과정은 대칭적으로 이루어집니다. +다른 모든 토큰들은 로컬 창 내의 토큰들에 더해 해당 특정 토큰들에도 접근할 수 있습니다. 이는 논문의 Figure 2d에서 나타나며, 아래에 샘플 어텐션 마스크가 제시되어 있습니다: + + +
+ +
+ + +적은 파라미터의 어텐션 행렬을 사용하면 모델이 더 큰 시퀀스 입력 길이를 가질 수 있습니다. + +## 다른 방법들[[other_tricks]] + +### 축별 위치 인코딩[[axial_positional_encodings]] + +[Reformer](#reformer)는 축별 위치 인코딩(axial positional encodings)을 사용합니다. 기존의 트랜스포머 모델에서는 위치 인코딩 행렬 E는 크기가 \\(l \times d\\)인 행렬이며, +여기서 \\(l\\)은 시퀀스 길이(sequence length)이고 \\(d\\)는 숨겨진 상태(hidden state)의 차원입니다. 매우 긴 텍스트의 경우, 이 행렬은 매우 크며 GPU 상에서 공간을 많이 차지할 수 있습니다. +이를 완화하기 위해, 축별 위치 인코딩은 큰 행렬 E를 두 개의 작은 행렬 E1과 E2로 분해합니다. 이때 E1의 크기는 \\(l_{1} \times d_{1}\\)이고, E2의 크기는 \\(l_{2} \times d_{2}\\)입니다. +이때 \\(l_{1} \times l_{2} = l\\)이고 \\(d_{1} + d_{2} = d\\)(길이에 대한 곱셈 연산을 사용하면 훨씬 작아집니다). E의 시간 단계 j에 대한 임베딩은 E1에서 시간 단계 \\(j \% l1\\)의 임베딩과 E2에서 시간 단계 \\(j // l1\\)의 임베딩을 연결하여 얻습니다. \ No newline at end of file diff --git a/docs/source/ko/autoclass_tutorial.md b/docs/source/ko/autoclass_tutorial.md new file mode 100644 index 000000000000..9ecfd9c2015d --- /dev/null +++ b/docs/source/ko/autoclass_tutorial.md @@ -0,0 +1,144 @@ + + +# AutoClass로 사전 학습된 인스턴스 로드[[load-pretrained-instances-with-an-autoclass]] + +트랜스포머 아키텍처가 매우 다양하기 때문에 체크포인트에 맞는 아키텍처를 생성하는 것이 어려울 수 있습니다. 라이브러리를 쉽고 간단하며 유연하게 사용하기 위한 Transformer 핵심 철학의 일환으로, `AutoClass`는 주어진 체크포인트에서 올바른 아키텍처를 자동으로 추론하여 로드합니다. `from_pretrained()` 메서드를 사용하면 모든 아키텍처에 대해 사전 학습된 모델을 빠르게 로드할 수 있으므로 모델을 처음부터 학습하는 데 시간과 리소스를 투입할 필요가 없습니다. +체크포인트에 구애받지 않는 코드를 생성한다는 것은 코드가 한 체크포인트에서 작동하면 아키텍처가 다르더라도 다른 체크포인트(유사한 작업에 대해 학습된 경우)에서도 작동한다는 것을 의미합니다. + + + +아키텍처는 모델의 골격을 의미하며 체크포인트는 주어진 아키텍처에 대한 가중치입니다. 예를 들어, [BERT](https://huggingface.co/bert-base-uncased)는 아키텍처이고, `bert-base-uncased`는 체크포인트입니다. 모델은 아키텍처 또는 체크포인트를 의미할 수 있는 일반적인 용어입니다. + + + +이 튜토리얼에서는 다음을 학습합니다: + +* 사전 학습된 토크나이저 로드하기. +* 사전 학습된 이미지 프로세서 로드하기. +* 사전 학습된 특징 추출기 로드하기. +* 사전 훈련된 프로세서 로드하기. +* 사전 학습된 모델 로드하기. + +## AutoTokenizer[[autotokenizer]] + +거의 모든 NLP 작업은 토크나이저로 시작됩니다. 토크나이저는 사용자의 입력을 모델에서 처리할 수 있는 형식으로 변환합니다. +[`AutoTokenizer.from_pretrained`]로 토크나이저를 로드합니다: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") +``` + +그리고 아래와 같이 입력을 토큰화합니다: + +```py +>>> sequence = "In a hole in the ground there lived a hobbit." +>>> print(tokenizer(sequence)) +{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +## AutoImageProcessor[[autoimageprocessor]] + +비전 작업의 경우 이미지 프로세서가 이미지를 올바른 입력 형식으로 처리합니다. + +```py +>>> from transformers import AutoImageProcessor + +>>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") +``` + + +## AutoFeatureExtractor[[autofeatureextractor]] + +오디오 작업의 경우 특징 추출기가 오디오 신호를 올바른 입력 형식으로 처리합니다. + +[`AutoFeatureExtractor.from_pretrained`]로 특징 추출기를 로드합니다: + +```py +>>> from transformers import AutoFeatureExtractor + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained( +... "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" +... ) +``` + +## AutoProcessor[[autoprocessor]] + +멀티모달 작업에는 두 가지 유형의 전처리 도구를 결합한 프로세서가 필요합니다. 예를 들어 LayoutLMV2 모델에는 이미지를 처리하는 이미지 프로세서와 텍스트를 처리하는 토크나이저가 필요하며, 프로세서는 이 두 가지를 결합합니다. + +[`AutoProcessor.from_pretrained()`]로 프로세서를 로드합니다: + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased") +``` + +## AutoModel[[automodel]] + + + +마지막으로 AutoModelFor클래스를 사용하면 주어진 작업에 대해 미리 학습된 모델을 로드할 수 있습니다 (사용 가능한 작업의 전체 목록은 [여기](model_doc/auto)를 참조하세요). 
예를 들어, [`AutoModelForSequenceClassification.from_pretrained`]를 사용하여 시퀀스 분류용 모델을 로드할 수 있습니다: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +동일한 체크포인트를 쉽게 재사용하여 다른 작업에 아키텍처를 로드할 수 있습니다: + +```py +>>> from transformers import AutoModelForTokenClassification + +>>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") +``` + + + +PyTorch모델의 경우 `from_pretrained()` 메서드는 내부적으로 피클을 사용하여 안전하지 않은 것으로 알려진 `torch.load()`를 사용합니다. +일반적으로 신뢰할 수 없는 소스에서 가져왔거나 변조되었을 수 있는 모델은 로드하지 마세요. 허깅 페이스 허브에서 호스팅되는 공개 모델의 경우 이러한 보안 위험이 부분적으로 완화되며, 각 커밋 시 멀웨어를 [검사합니다](https://huggingface.co/docs/hub/security-malware). GPG를 사용해 서명된 [커밋 검증](https://huggingface.co/docs/hub/security-gpg#signing-commits-with-gpg)과 같은 모범사례는 [문서](https://huggingface.co/docs/hub/security)를 참조하세요. + +텐서플로우와 Flax 체크포인트는 영향을 받지 않으며, `from_pretrained`메서드에 `from_tf` 와 `from_flax` 키워드 가변 인자를 사용하여 이 문제를 우회할 수 있습니다. + + + +일반적으로 AutoTokenizer 클래스와 AutoModelFor 클래스를 사용하여 미리 학습된 모델 인스턴스를 로드하는 것이 좋습니다. 이렇게 하면 매번 올바른 아키텍처를 로드할 수 있습니다. 다음 [튜토리얼](preprocessing)에서는 새롭게 로드한 토크나이저, 이미지 프로세서, 특징 추출기를 사용하여 미세 튜닝용 데이터 세트를 전처리하는 방법에 대해 알아봅니다. + + +마지막으로 `TFAutoModelFor` 클래스를 사용하면 주어진 작업에 대해 사전 훈련된 모델을 로드할 수 있습니다. (사용 가능한 작업의 전체 목록은 [여기](model_doc/auto)를 참조하세요. 예를 들어, [`TFAutoModelForSequenceClassification.from_pretrained`]로 시퀀스 분류를 위한 모델을 로드합니다: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +쉽게 동일한 체크포인트를 재사용하여 다른 작업에 아키텍처를 로드할 수 있습니다: + +```py +>>> from transformers import TFAutoModelForTokenClassification + +>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") +``` + +일반적으로, `AutoTokenizer`클래스와 `TFAutoModelFor` 클래스를 사용하여 미리 학습된 모델 인스턴스를 로드하는 것이 좋습니다. 이렇게 하면 매번 올바른 아키텍처를 로드할 수 있습니다. 다음 [튜토리얼](preprocessing)에서는 새롭게 로드한 토크나이저, 이미지 프로세서, 특징 추출기를 사용하여 미세 튜닝용 데이터 세트를 전처리하는 방법에 대해 알아봅니다. + + diff --git a/docs/source/ko/bertology.md b/docs/source/ko/bertology.md new file mode 100644 index 000000000000..7b4f3dc4c493 --- /dev/null +++ b/docs/source/ko/bertology.md @@ -0,0 +1,41 @@ + + +# BERTology + +BERT와 같은 대규모 트랜스포머의 내부 동작을 조사하는 연구 분야가 점점 더 중요해지고 있습니다. +혹자는 "BERTology"라 칭하기도 합니다. 이 분야의 좋은 예시는 다음과 같습니다: + + +- BERT는 고전적인 NLP 파이프라인의 재발견 - Ian Tenney, Dipanjan Das, Ellie Pavlick: + https://arxiv.org/abs/1905.05950 +- 16개의 헤드가 정말로 1개보다 나은가? - Paul Michel, Omer Levy, Graham Neubig: + https://arxiv.org/abs/1905.10650 +- BERT는 무엇을 보는가? BERT의 어텐션 분석 - Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning: + https://arxiv.org/abs/1906.04341 +- CAT-probing: 프로그래밍 언어에 대해 사전훈련된 모델이 어떻게 코드 구조를 보는지 알아보기 위한 메트릭 기반 접근 방법: + https://arxiv.org/abs/2210.04633 + +우리는 이 새로운 연구 분야의 발전을 돕기 위해, BERT/GPT/GPT-2 모델에 내부 표현을 살펴볼 수 있는 몇 가지 기능을 추가했습니다. +이 기능들은 주로 Paul Michel의 훌륭한 작업을 참고하여 개발되었습니다 +(https://arxiv.org/abs/1905.10650): + + +- BERT/GPT/GPT-2의 모든 은닉 상태에 접근하기, +- BERT/GPT/GPT-2의 각 헤드의 모든 어텐션 가중치에 접근하기, +- 헤드의 출력 값과 그래디언트를 검색하여 헤드 중요도 점수를 계산하고 https://arxiv.org/abs/1905.10650에서 설명된 대로 헤드를 제거하는 기능을 제공합니다. + +이러한 기능들을 이해하고 직접 사용해볼 수 있도록 [bertology.py](https://github.com/huggingface/transformers/tree/main/examples/research_projects/bertology/run_bertology.py) 예제 스크립트를 추가했습니다. 이 예제 스크립트에서는 GLUE에 대해 사전훈련된 모델에서 정보를 추출하고 모델을 가지치기(prune)해봅니다. 
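As a minimal illustration of the hooks described above (and not a substitute for the full `bertology.py` script), hidden states and attention weights can be requested directly when loading a model; the checkpoint name below is only an example.

```python
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained(
    "bert-base-uncased", output_hidden_states=True, output_attentions=True
)

inputs = tokenizer("BERTology looks inside the model.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# One hidden-state tensor per layer (plus the embedding output), and one attention
# tensor per layer with shape (batch, num_heads, seq_len, seq_len)
print(len(outputs.hidden_states), outputs.hidden_states[-1].shape)
print(len(outputs.attentions), outputs.attentions[0].shape)
```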
diff --git a/docs/source/ko/community.md b/docs/source/ko/community.md new file mode 100644 index 000000000000..2d12e9de4a28 --- /dev/null +++ b/docs/source/ko/community.md @@ -0,0 +1,69 @@ + + +# 커뮤니티 [[community]] + +이 페이지는 커뮤니티에서 개발한 🤗 Transformers 리소스를 재구성한 페이지입니다. + +## 커뮤니티 리소스: [[community-resources]] + +| 리소스 | 설명 | 만든이 | +|:----------|:-------------|------:| +| [Hugging Face Transformers 용어집 플래시카드](https://www.darigovresearch.com/huggingface-transformers-glossary-flashcards) | [Transformers 문서 용어집](glossary)을 기반으로 한 플래시카드 세트로, 지식을 장기적으로 유지하기 위해 특별히 설계된 오픈소스 크로스 플랫폼 앱인 [Anki](https://apps.ankiweb.net/)를 사용하여 쉽게 학습/수정할 수 있는 형태로 제작되었습니다. [플래시카드 사용법에 대한 소개 동영상](https://www.youtube.com/watch?v=Dji_h7PILrw)을 참조하세요. | [Darigov 리서치](https://www.darigovresearch.com/) | + +## 커뮤니티 노트북: [[community-notebooks]] + +| 노트북 | 설명 | 만든이 | | +|:----------|:-------------|:-------------|------:| +| [가사를 생성하기 위해 사전훈련된 트랜스포머를 미세 조정하기](https://github.com/AlekseyKorshuk/huggingartists) | GPT-2 모델을 미세 조정하여 좋아하는 아티스트의 스타일로 가사를 생성하는 방법 | [Aleksey Korshuk](https://github.com/AlekseyKorshuk) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb) | +| [Tensorflow 2로 T5 훈련하기](https://github.com/snapthat/TF-T5-text-to-text) | Tensorflow 2를 사용하여 T5를 훈련시키는 방법. 이 노트북은 Tensorflow 2로 SQUAD를 사용하여 구현한 질의응답 작업을 보여줍니다. | [Muhammad Harris](https://github.com/HarrisDePerceptron) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb) | +| [TPU에서 T5 훈련하기](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb) | Transformers와 Nlp를 사용하여 SQUAD로 T5를 훈련하는 방법 | [Suraj Patil](https://github.com/patil-suraj) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) | +| [분류 및 객관식 문제를 위해 T5 미세 조정하기](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | 분류 및 객관식 문제에 맞게 텍스트-텍스트 형식을 사용하여 PyTorch Lightning으로 T5를 미세 조정하는 방법 | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | +| [새로운 데이터 세트와 언어로 DialoGPT 미세 조정하기](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | 자유 대화형 챗봇을 만들기 위해 새로운 데이터 세트로 DialoGPT 모델을 미세 조정하는 방법 | [Nathan Cooper](https://github.com/ncoop57) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | +| [Reformer로 긴 시퀀스 모델링하기](https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | Reformer로 최대 50만 토큰의 시퀀스를 훈련하는 방법 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | +| [요약을 위해 BART 미세 조정하기](https://github.com/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb) | blurr를 사용하여 fastai로 요약하기 위해 BART를 미세 조정하는 방법 | [Wayde 
Gilliam](https://ohmeow.com/) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb) | +| [다른 사람의 트윗으로 사전훈련된 트랜스포머 미세 조정하기](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | GPT-2 모델을 미세 조정하여 좋아하는 트위터 계정 스타일로 트윗을 생성하는 방법 | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | +| [Weights & Biases로 🤗 Hugging Face 모델 최적화하기](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | W&B와 Hugging Face의 통합을 보여주는 전체 튜토리얼 | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | +| [Longformer 사전훈련하기](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | 기존 사전훈련된 모델의 "긴" 버전을 빌드하는 방법 | [Iz Beltagy](https://beltagy.net) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | +| [QA를 위해 Longformer 미세 조정하기](https://github.com/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | QA 작업을 위해 Longformer를 미세 조정하는 방법 | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | +| [🤗 Nlp로 모델 평가하기](https://github.com/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb) | `Nlp`로 TriviaQA에서 Longformer를 평가하는 방법 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1m7eTGlPmLRgoPkkA7rkhQdZ9ydpmsdLE?usp=sharing) | +| [감정 범위 추출을 위해 T5 미세 조정하기](https://github.com/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | 감정 범위 추출을 위해 텍스트-텍스트 형식을 사용하여 PyTorch Lightning으로 T5를 미세 조정하는 방법 | [Lorenzo Ampil](https://github.com/enzoampil) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | +| [다중 클래스 분류를 위해 DistilBert 미세 조정하기](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb) | 다중 클래스 분류를 위해 PyTorch를 사용하여 DistilBert를 미세 조정하는 방법 | [Abhishek Kumar Mishra](https://github.com/abhimishra91) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb)| +| [다중 레이블 분류를 위해 BERT 미세 조정하기](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb) | 다중 레이블 분류를 위해 PyTorch를 사용하여 BERT를 미세 조정하는 방법 | [Abhishek Kumar Mishra](https://github.com/abhimishra91) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)| +| [요약을 위해 T5 미세 조정하기](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb) | 요약을 위해 PyTorch로 T5를 미세 조정하고 WandB로 실험을 추적하는 방법 | [Abhishek Kumar Mishra](https://github.com/abhimishra91) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)| +| [동적 패딩/버켓팅으로 Transformers 미세 조정 속도 높이기](https://github.com/ELS-RD/transformers-notebook/blob/master/Divide_Hugging_Face_Transformers_training_time_by_2_or_more.ipynb)| 동적 패딩/버켓팅을 사용하여 미세 조정 속도를 2배로 높이는 방법 |[Michael Benesty](https://github.com/pommedeterresautee) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CBfRU1zbfu7-ijiOqAAQUA-RJaxfcJoO?usp=sharing)| +|[마스킹된 언어 모델링을 위해 Reformer 사전훈련하기](https://github.com/patrickvonplaten/notebooks/blob/master/Reformer_For_Masked_LM.ipynb)| 양방향 셀프 어텐션 레이어를 이용해서 Reformer 모델을 훈련하는 방법 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tzzh0i8PgDQGV3SMFUGxM7_gGae3K-uW?usp=sharing)| +| [Sci-BERT 확장 및 미세 조정하기](https://github.com/lordtt13/word-embeddings/blob/master/COVID-19%20Research%20Data/COVID-SciBERT.ipynb)| CORD 데이터 세트로 AllenAI에서 사전훈련된 SciBERT 모델의 어휘를 늘리고 파이프라인을 구축하는 방법 | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rqAR40goxbAfez1xvF3hBJphSCsvXmh8)| +| [요약을 위해 Trainer API로 BlenderBotSmall 미세 조정하기](https://github.com/lordtt13/transformers-experiments/blob/master/Custom%20Tasks/fine-tune-blenderbot_small-for-summarization.ipynb)| 요약을 위해 Trainer API를 사용하여 사용자 지정 데이터 세트로 BlenderBotSmall 미세 조정하기 | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/19Wmupuls7mykSGyRN_Qo6lPQhgp56ymq?usp=sharing)| +| [통합 기울기(Integrated Gradient)를 이용하여 Electra 미세 조정하고 해석하기](https://github.com/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb) | 감정 분석을 위해 Electra를 미세 조정하고 Captum 통합 기울기로 예측을 해석하는 방법 | [Eliza Szczechla](https://elsanns.github.io) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb)| +| [Trainer 클래스로 비영어권 GPT-2 모델 미세 조정하기](https://github.com/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb) | Trainer 클래스로 비영어권 GPT-2 모델을 미세 조정하는 방법 | [Philipp Schmid](https://www.philschmid.de) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb)| +|[다중 라벨 분류 작업을 위해 DistilBERT 모델 미세 조정하기](https://github.com/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb) | 다중 라벨 분류 작업을 위해 DistilBERT 모델을 미세 조정하는 방법 | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)| +|[문장쌍 분류를 위해 ALBERT 미세 조정하기](https://github.com/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb) | 문장쌍 분류 작업을 위해 ALBERT 모델 또는 다른 BERT 기반 모델을 미세 조정하는 방법 | [Nadir El Manouzi](https://github.com/NadirEM) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb)| +|[감정 분석을 위해 Roberta 미세 조정하기](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | 감정 분석을 위해 Roberta 모델을 미세 조정하는 방법 | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)| +|[질문 생성 모델 평가하기](https://github.com/flexudy-pipe/qugeev) | seq2seq 트랜스포머 모델이 생성한 질문과 이에 대한 답변이 얼마나 정확한가요? | [Pascal Zoleko](https://github.com/zolekode) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)| +|[DistilBERT와 Tensorflow로 텍스트 분류하기](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | 텍스트 분류를 위해 TensorFlow로 DistilBERT를 미세 조정하는 방법 | [Peter Bayerle](https://github.com/peterbayerle) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)| +|[CNN/Dailail 요약을 위해 인코더-디코더 모델에 BERT 활용하기](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | CNN/Dailail 요약을 위해 *bert-base-uncased* 체크포인트를 활용하여 *EncoderDecoderModel*을 워밍업하는 방법 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)| +|[BBC XSum 요약을 위해 인코더-디코더 모델에 RoBERTa 활용하기](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | BBC/XSum 요약을 위해 *roberta-base* 체크포인트를 활용하여 공유 *EncoderDecoderModel*을 워밍업하는 방법 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)| +|[순차적 질문 답변(SQA)을 위해 TAPAS 미세 조정하기](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | *tapas-base* 체크포인트를 활용하여 순차적 질문 답변(SQA) 데이터 세트로 *TapasForQuestionAnswering*을 미세 조정하는 방법 | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)| +|[표 사실 검사(TabFact)로 TAPAS 평가하기](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | 🤗 Datasets와 🤗 Transformer 라이브러리를 함께 사용하여 *tapas-base-finetuned-tabfact* 체크포인트로 미세 조정된 *TapasForSequenceClassification*을 평가하는 방법 | [Niels 
Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)| +|[번역을 위해 mBART 미세 조정하기](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | 힌디어에서 영어로 번역하기 위해 Seq2SeqTrainer를 사용하여 mBART를 미세 조정하는 방법 | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)| +|[FUNSD(양식 이해 데이터 세트)로 LayoutLM 미세 조정하기](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb) | 스캔한 문서에서 정보 추출을 위해 FUNSD 데이터 세트로 *LayoutLMForTokenClassification*을 미세 조정하는 방법 | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb)| +|[DistilGPT2 미세 조정하고 및 텍스트 생성하기](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb) | DistilGPT2를 미세 조정하고 텍스트를 생성하는 방법 | [Aakash Tripathi](https://github.com/tripathiaakash) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb)| +|[최대 8K 토큰에서 LED 미세 조정하기](https://github.com/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb) | 긴 범위를 요약하기 위해 PubMed로 LED를 미세 조정하는 방법 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb)| +|[Arxiv로 LED 평가하기](https://github.com/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb) | 긴 범위 요약에 대해 LED를 효과적으로 평가하는 방법 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb)| +|[RVL-CDIP(문서 이미지 분류 데이터 세트)로 LayoutLM 미세 조정하기)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb) | 스캔 문서 분류를 위해 RVL-CDIP 데이터 세트로 *LayoutLMForSequenceClassification*을 미세 조정하는 방법 | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb)| +|[GPT2 조정을 통한 Wav2Vec2 CTC 디코딩](https://github.com/voidful/huggingface_notebook/blob/main/xlsr_gpt.ipynb) | 언어 모델 조정을 통해 CTC 시퀀스를 디코딩하는 방법 | [Eric Lam](https://github.com/voidful) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1e_z5jQHYbO2YKEaUgzb1ww1WwiAyydAj?usp=sharing)| +|[Trainer 클래스로 두 개 언어로 요약하기 위해 BART 미세 
조정하기](https://github.com/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb) | Trainer 클래스로 두 개 언어로 요약하기 위해 BART 미세 조정하는 방법 | [Eliza Szczechla](https://github.com/elsanns) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)| +|[Trivia QA로 Big Bird 평가하기](https://github.com/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb) | Trivia QA로 긴 문서 질문에 대한 답변에 대해 BigBird를 평가하는 방법 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb)| +| [Wav2Vec2를 사용하여 동영상 캡션 만들기](https://github.com/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | Wav2Vec으로 오디오를 텍스트로 변환하여 모든 동영상에서 YouTube 캡션 만드는 방법 | [Niklas Muennighoff](https://github.com/Muennighoff) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | +| [PyTorch Lightning을 사용하여 CIFAR-10으로 비전 트랜스포머 미세 조정하기](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | HuggingFace Transformers, Datasets, PyTorch Lightning을 사용하여 CIFAR-10으로 비전 트랜스포머(ViT)를 미세 조정하는 방법 | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | +| [🤗 Trainer를 사용하여 CIFAR-10에서 비전 트랜스포머 미세 조정하기](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | Datasets, 🤗 Trainer를 사용하여 CIFAR-10에서 비전 트랜스포머(ViT)를 미세 조정하는 방법 | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | +| [개체 입력 데이터 세트인 Open Entity로 LUKE 평가하기](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | Open Entity 데이터 세트로 *LukeForEntityClassification*을 평가하는 방법 | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | +| [관계 추출 데이터 세트인 TACRED로 LUKE 평가하기](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | TACRED 데이터 세트로 *LukeForEntityPairClassification*을 평가하는 방법 | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | +| [중요 NER 벤치마크인 CoNLL-2003으로 LUKE 평가하기](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | CoNLL-2003 데이터 세트로 *LukeForEntitySpanClassification*를 평가하는 방법 | [Ikuya 
Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | +| [PubMed 데이터 세트로 BigBird-Pegasus 평가하기](https://github.com/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | PubMed 데이터 세트로 *BigBirdPegasusForConditionalGeneration*를 평가하는 방법 | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | +| [Wav2Vec2를 사용해서 음성 감정 분류하기](https://github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | 감정 분류를 위해 사전훈련된 Wav2Vec2 모델을 MEGA 데이터 세트에 활용하는 방법 | [Mehrdad Farahani](https://github.com/m3hrdadfi) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | +| [DETR로 이미지에서 객체 탐지하기](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | 훈련된 *DetrForObjectDetection* 모델을 사용하여 이미지에서 객체를 탐지하고 어텐션을 시각화하는 방법 | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | +| [사용자 지정 객체 탐지 데이터 세트로 DETR 미세 조정하기](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | 사용자 지정 객체 탐지 데이터 세트로 *DetrForObjectDetection*을 미세 조정하는 방법 | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | +| [개체명 인식을 위해 T5 미세 조정하기](https://github.com/ToluClassics/Notebooks/blob/main/T5_Ner_Finetuning.ipynb) | 개체명 인식 작업을 위해 *T5*를 미세 조정하는 방법 | [Ogundepo Odunayo](https://github.com/ToluClassics) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing) | diff --git a/docs/source/ko/contributing.md b/docs/source/ko/contributing.md new file mode 100644 index 000000000000..0f37c2b09265 --- /dev/null +++ b/docs/source/ko/contributing.md @@ -0,0 +1,332 @@ + + +# 🤗 Transformers에 기여하기 [[contribute-to-transformers]] + +누구나 🤗 Transformers에 기여할 수 있으며, 우리는 모든 사람의 기여를 소중히 생각합니다. 코드 기여는 커뮤니티를 돕는 유일한 방법이 아닙니다. 질문에 답하거나 다른 사람을 도와 문서를 개선하는 것도 매우 가치가 있습니다. + +🤗 Transformers를 널리 알리는 것도 큰 도움이 됩니다! 멋진 프로젝트들을 가능하게 한 🤗 Transformers 라이브러리에 대해 블로그 게시글에 언급하거나, 도움이 되었을 때마다 Twitter에 알리거나, 저장소에 ⭐️ 를 표시하여 감사 인사를 전해주세요. + +어떤 방식으로 기여하든 [행동 규칙](https://github.com/huggingface/transformers/blob/main/CODE_OF_CONDUCT.md)을 숙지하고 존중해주세요. + +**이 안내서는 멋진 [scikit-learn 기여 안내서](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md)에서 큰 영감을 받았습니다.** + +## 기여하는 방법 [[ways-to-contribute]] + +여러 가지 방법으로 🤗 Transformers에 기여할 수 있습니다: + +* 기존 코드의 미해결된 문제를 수정합니다. +* 버그 또는 새로 추가되길 원하는 기능과 관련된 이슈를 제출합니다. +* 새로운 모델을 구현합니다. +* 예제나 문서에 기여합니다. 
+ +어디서부터 시작할지 모르겠다면, [Good First Issue](https://github.com/huggingface/transformers/contribute) 목록을 확인해보세요. 이 목록은 초보자도 참여하기 쉬운 오픈 이슈 목록을 제공하며, 당신이 오픈소스에 처음으로 기여하는 데 큰 도움이 될 것입니다. 그저 작업하고 싶은 이슈에 댓글만 달아주면 됩니다. + +조금 더 도전적인 작업을 원한다면, [Good Second Issue](https://github.com/huggingface/transformers/labels/Good%20Second%20Issue) 목록도 확인해보세요. 이미 당신이 잘 하고 있다고 생각되더라도, 한 번 시도해보세요! 우리도 여러분을 도울 것입니다. 🚀 + +> 커뮤니티에 이루어지는 모든 기여는 똑같이 소중합니다. 🥰 + +## 미해결된 문제 수정하기 [[fixing-outstanding-issues]] + +기존 코드에서 발견한 문제점에 대한 해결책이 떠오른 경우, 언제든지 [기여를 시작](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md/#create-a-pull-request)하고 Pull Request를 생성해주세요! + +## 버그 관련 이슈를 제기하거나 새로운 기능 요청하기 [[submitting-a-bugrelated-issue-or-feature-request]] + +버그 관련 이슈를 제기하거나 새로운 기능을 요청할 때는 다음 가이드라인을 최대한 준수해주세요. 이렇게 하면 좋은 피드백과 함께 빠르게 답변해 드릴 수 있습니다. + +### 버그를 발견하셨나요? [[did-you-find-a-bug]] + +🤗 Transformers 라이브러리는 사용 중에 겪는 문제를 보고해주는 사용자들 덕분에 더욱 견고해지고 신뢰할 수 있게 되었습니다. + +이슈를 보고하기 전에, 버그가 이미 **보고되지 않았는지** 확인해주세요. (GitHub의 이슈 탭 아래의 검색 바를 사용하세요). 이슈는 라이브러리 자체에서 발생한 버그어야 하며, 코드의 다른 부분과 관련된 것이 아니어야 합니다. 버그가 라이브러리의 문제로 발생하였는지 확실하지 않은 경우 먼저 [포럼](https://discuss.huggingface.co/)에서 질문해 주세요. 이렇게 하면 일반적인 질문보다 라이브러리와 관련된 문제를 더 빠르게 해결할 수 있습니다. + +버그가 이미 보고되지 않았다는 것을 확인했다면, 다음 정보를 포함하여 이슈를 제출해 주세요. 그러면 우리가 빠르게 해결할 수 있습니다: + +* 사용 중인 **운영체제 종류와 버전**, 그리고 **Python**, **PyTorch** 또는 **TensorFlow** 버전. +* 버그를 30초 이내로 재현할 수 있는 간단하고 독립적인 코드 스니펫. +* 예외가 발생한 경우 *전체* 트레이스백. +* 스크린샷과 같이 도움이 될 것으로 생각되는 추가 정보를 첨부해 주세요. + +운영체제와 소프트웨어 버전을 자동으로 가져오려면 다음 명령을 실행하세요: + +```bash +transformers-cli env +``` + +저장소의 루트 디렉터리에서도 같은 명령을 실행할 수 있습니다: + +```bash +python src/transformers/commands/transformers_cli.py env +``` + + +### 새로운 기능을 원하시나요? [[do-you-want-a-new-feature]] + +🤗 Transformers에서 사용하고 싶은 새로운 기능이 있다면, 다음 내용을 포함하여 이슈를 제출해 주세요: + +1. 이 기능이 필요한 *이유*는 무엇인가요? 라이브러리에 대한 문제나 불만과 관련이 있나요? 프로젝트에 필요한 기능인가요? 커뮤니티에 도움이 될 만한 기능인가요? + + 어떤 내용이든 여러분의 이야기를 듣고 싶습니다! + +2. 요청하는 기능을 최대한 자세히 설명해 주세요. 더 많은 정보를 제공할수록 더 나은 도움을 드릴 수 있습니다. +3. 해당 기능의 사용법을 보여주는 *코드 스니펫*을 제공해 주세요. +4. 기능과 관련된 논문이 있는 경우 링크를 포함해 주세요. + +이슈가 잘 작성되었다면 이슈가 생성된 순간, 이미 80% 정도의 작업이 완료된 것입니다. + +이슈를 제기하는 데 도움이 될 만한 [템플릿](https://github.com/huggingface/transformers/tree/main/templates)도 준비되어 있습니다. + +## 새로운 모델을 구현하고 싶으신가요? [[do-you-want-to-implement-a-new-model]] + +새로운 모델은 계속해서 출시됩니다. 만약 여러분이 새로운 모델을 구현하고 싶다면 다음 정보를 제공해 주세요. + +* 모델에 대한 간단한 설명과 논문 링크. +* 구현이 공개되어 있다면 구현 링크. +* 모델 가중치가 사용 가능하다면 가중치 링크. + +만약 모델을 직접 기여하고 싶으시다면, 알려주세요. 🤗 Transformers에 추가할 수 있도록 도와드리겠습니다! + +새로운 모델을 추가하는 방법에 대한 [상세 안내서와 템플릿](https://github.com/huggingface/transformers/tree/main/templates)을 제공하고 있으며, [🤗 Transformers에 새로운 모델을 추가하는 방법](https://huggingface.co/docs/transformers/add_new_model)에 대한 기술적인 안내서도 있습니다. + +## 문서를 추가하고 싶으신가요? [[do-you-want-to-add-documentation]] + +우리는 언제나 더 명확하고 정확한 문서를 제공하기 위하여 개선점을 찾고 있습니다. 오탈자나 부족한 내용, 분명하지 않거나 부정확한 내용 등을 알려주시면 개선하는 데 도움이 됩니다. 관심이 있으시다면 변경하거나 기여하실 수 있도록 도와드리겠습니다! + +문서를 생성, 빌드 및 작성하는 방법에 대한 자세한 내용은 [README](https://github.com/huggingface/transformers/tree/main/docs) 문서를 확인해 주세요. + +## 풀 리퀘스트(Pull Request) 생성하기 [[create-a-pull-request]] + +코드를 작성하기 전에 기존의 Pull Request나 이슈를 검색하여 누군가 이미 동일한 작업을 하고 있는지 확인하는 것이 좋습니다. 확실하지 않다면 피드백을 받기 위해 이슈를 열어보는 것이 좋습니다. + +🤗 Transformers에 기여하기 위해서는 기본적인 `git` 사용 능력이 필요합니다. `git`은 사용하기 쉬운 도구는 아니지만, 매우 훌륭한 매뉴얼을 제공합니다. 쉘(shell)에서 `git --help`을 입력하여 확인해보세요! 만약 책을 선호한다면, [Pro Git](https://git-scm.com/book/en/v2)은 매우 좋은 참고 자료가 될 것입니다. 
+ +🤗 Transformers에 기여하려면 **[Python 3.8]((https://github.com/huggingface/transformers/blob/main/setup.py#L426))** 이상의 버전이 필요합니다. 기여를 시작하려면 다음 단계를 따르세요: + +1. 저장소 페이지에서 **[Fork](https://github.com/huggingface/transformers/fork)** 버튼을 클릭하여 저장소를 포크하세요. 이렇게 하면 코드의 복사본이 여러분의 GitHub 사용자 계정 아래에 생성됩니다. + +2. 포크한 저장소를 로컬 디스크로 클론하고, 기본 저장소를 원격(remote)으로 추가하세요: + + ```bash + git clone git@github.com:/transformers.git + cd transformers + git remote add upstream https://github.com/huggingface/transformers.git + ``` + +3. 개발 변경 사항을 저장할 새 브랜치를 생성하세요: + + ```bash + git checkout -b a-descriptive-name-for-my-changes + ``` + + 🚨 절대 `main` 브랜치에서 작업하지 **마세요!** + +4. 가상 환경에서 다음 명령을 실행하여 개발 환경을 설정하세요: + + ```bash + pip install -e ".[dev]" + ``` + + 만약 이미 가상 환경에 🤗 Transformers가 설치되어 있다면, `-e` 플래그를 사용하여 설치하기 전에 `pip uninstall transformers`로 제거해주세요. + + 여러분의 운영체제에 따라서, 그리고 🤗 Transformers의 선택적 의존성의 수가 증가하면서, 이 명령이 실패할 수도 있습니다. 그럴 경우 사용하려는 딥러닝 프레임워크(PyTorch, TensorFlow, 그리고/또는 Flax)를 설치한 후 아래 명령을 실행해주세요: + + ```bash + pip install -e ".[quality]" + ``` + + 대부분의 경우 이것으로 충분할 것입니다. + +5. 브랜치에서 기능을 개발하세요. + + 코드를 작업하는 동안 테스트 스위트(test suite)가 통과하는지 확인하세요. 다음과 같이 변경 사항에 영향을 받는 테스트를 실행하세요: + + ```bash + pytest tests/.py + ``` + + 테스트에 대한 더 많은 정보는 [테스트](https://huggingface.co/docs/transformers/testing) 가이드를 확인하세요. + + 🤗 Transformers는 `black`과 `ruff`를 사용하여 소스 코드의 형식을 일관되게 유지합니다. 변경 사항을 적용한 후에는 다음 명령으로 자동으로 스타일 교정 및 코드 검증을 수행하세요: + + ```bash + make fixup + ``` + + 이것은 또한 작업 중인 PR에서 수정한 파일에서만 작동하도록 최적화되어 있습니다. + + 검사를 하나씩 실행하려는 경우, 다음 명령으로 스타일 교정을 적용할 수 있습니다: + + ```bash + make style + ``` + + 🤗 Transformers는 또한 `ruff`와 몇 가지 사용자 정의 스크립트를 사용하여 코딩 실수를 확인합니다. CI를 통해 품질 관리가 수행되지만, 다음 명령으로 동일한 검사를 실행할 수 있습니다: + + ```bash + make quality + ``` + + 마지막으로, 새 모델을 추가할 때 일부 파일을 업데이트하는 것을 잊지 않도록 하기 위한 많은 스크립트가 있습니다. 다음 명령으로 이러한 스크립트를 실행할 수 있습니다: + + ```bash + make repo-consistency + ``` + + 이러한 검사에 대해 자세히 알아보고 관련 문제를 해결하는 방법은 [Pull Request에 대한 검사](https://huggingface.co/docs/transformers/pr_checks) 가이드를 확인하세요. + + 만약 `docs/source` 디렉터리 아래의 문서를 수정하는 경우, 문서가 빌드될 수 있는지 확인하세요. 이 검사는 Pull Request를 열 때도 CI에서 실행됩니다. 로컬 검사를 실행하려면 문서 빌더를 설치해야 합니다: + + ```bash + pip install ".[docs]" + ``` + + 저장소의 루트 디렉터리에서 다음 명령을 실행하세요: + + ```bash + doc-builder build transformers docs/source/en --build_dir ~/tmp/test-build + ``` + + 이 명령은 `~/tmp/test-build` 폴더에 문서를 빌드하며, 생성된 Markdown 파일을 선호하는 편집기로 확인할 수 있습니다. Pull Request를 열 때 GitHub에서 문서를 미리 볼 수도 있습니다. + + 변경 사항에 만족하면 `git add`로 변경된 파일을 추가하고, `git commit`으로 변경 사항을 로컬에 기록하세요: + + ```bash + git add modified_file.py + git commit + ``` + + [좋은 커밋 메시지](https://chris.beams.io/posts/git-commit/)를 작성하여 변경 사항을 명확하게 전달하세요! + + 변경 사항을 프로젝트 원본 저장소와 동기화하려면, PR을 *열기 전에* 브랜치를 `upstream/branch`로 리베이스(rebase)하세요. 또는 관리자의 요청에 이 작업이 필요할 수 있습니다: + + ```bash + git fetch upstream + git rebase upstream/main + ``` + + 변경 사항을 브랜치에 푸시하세요: + + ```bash + git push -u origin a-descriptive-name-for-my-changes + ``` + + 이미 PR을 열었다면, `--force` 플래그와 함께 강제 푸시해야 합니다. 아직 PR이 열리지 않았다면 정상적으로 변경 사항을 푸시하면 됩니다. + +6. 이제 GitHub에서 포크한 저장소로 이동하고 **Pull request(풀 리퀘스트)**를 클릭하여 Pull Request를 열 수 있습니다. 아래의 [체크리스트](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md/#pull-request-checklist)에서 모든 항목에 체크 표시를 하세요. 준비가 완료되면 프로젝트 관리자에게 변경 사항을 보내 검토를 요청할 수 있습니다. + +7. 관리자가 변경 사항을 요청해도 괜찮습니다. 핵심 기여자들도 동일한 상황을 겪습니다! 모두가 변경 사항을 Pull Request에서 볼 수 있도록, 로컬 브랜치에서 작업하고 변경 사항을 포크한 저장소로 푸시하세요. 그러면 변경 사항이 자동으로 Pull Request에 나타납니다. + +### Pull Request 체크리스트 [[pull-request-checklist]] + +☐ Pull Request 제목은 기여 내용을 요약해야 합니다.
+☐ Pull Request가 이슈를 해결하는 경우, Pull Request 설명에 이슈 번호를 언급하여 연관되어 있음을 알려주세요. (이슈를 확인하는 사람들이 해당 이슈에 대한 작업이 진행 중임을 알 수 있게 합니다).
+☐ 작업이 진행 중이라면 제목 앞에 `[WIP]`를 붙여주세요. 중복 작업을 피하고 병합할 준비가 된 PR과 구분하기에 유용합니다.
+☐ 기존 테스트를 통과하는지 확인하세요.
+☐ 새로운 기능을 추가하는 경우, 해당 기능에 대한 테스트도 추가하세요.
+ - 새 모델을 추가하는 경우, `ModelTester.all_model_classes = (MyModel, MyModelWithLMHead,...)`을 사용하여 일반적인 테스트를 활성화하세요. + - 새 `@slow` 테스트를 추가하는 경우, 다음 명령으로 테스트를 통과하는지 확인하세요: `RUN_SLOW=1 python -m pytest tests/models/my_new_model/test_my_new_model.py`. + - 새 토크나이저를 추가하는 경우, 테스트를 작성하고 다음 명령으로 테스트를 통과하는지 확인하세요: `RUN_SLOW=1 python -m pytest tests/models/{your_model_name}/test_tokenization_{your_model_name}.py`. + - CircleCI에서는 느린 테스트를 실행하지 않지만, GitHub Actions에서는 매일 밤 실행됩니다!
+ +☐ 모든 공개 메소드는 유용한 독스트링(docstring)을 가져야 합니다 (예를 들어 [`modeling_bert.py`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py) 참조).
+☐ 저장소가 빠르게 성장하고 있으므로 저장소에 상당한 부담을 주는 이미지, 동영상 및 기타 텍스트가 아닌 파일은 추가하지 마세요. 대신 [`hf-internal-testing`](https://huggingface.co/hf-internal-testing)과 같은 Hub 저장소를 사용하여 이러한 파일을 호스팅하고 URL로 참조하세요. 문서와 관련된 이미지는 다음 저장소에 배치하는 것을 권장합니다: [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images). 이 데이터셋 저장소에서 PR을 열어서 Hugging Face 멤버에게 병합을 요청할 수 있습니다. + +Pull Request에서 실행되는 검사에 대한 자세한 정보는 [Pull Request에 대한 검사](https://huggingface.co/docs/transformers/pr_checks) 가이드를 확인하세요. + +### 테스트 [[tests]] + +라이브러리 동작과 여러 예제를 테스트할 수 있는 광범위한 테스트 스위트가 포함되어 있습니다. 라이브러리 테스트는 [tests](https://github.com/huggingface/transformers/tree/main/tests) 폴더에, 예제 테스트는 [examples](https://github.com/huggingface/transformers/tree/main/examples) 폴더에 있습니다. + +속도가 빠른 `pytest`와 `pytest-xdist`를 선호합니다. 저장소의 루트 디렉터리에서 테스트를 실행할 *하위 폴더 경로 또는 테스트 파일 경로*를 지정하세요. + +```bash +python -m pytest -n auto --dist=loadfile -s -v ./tests/models/my_new_model +``` + +마찬가지로 `examples` 디렉터리에서도 *하위 폴더 경로 또는 테스트 파일 경로*를 지정하세요. 예를 들어, 다음 명령은 PyTorch `examples` 디렉터리의 텍스트 분류 하위 폴더를 테스트합니다: + +```bash +pip install -r examples/xxx/requirements.txt # only needed the first time +python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/text-classification +``` + +이것이 실제로 `make test` 및 `make test-examples` 명령이 구현되는 방식입니다 (`pip install`은 제외합니다)! + +또한 특정 기능만 테스트하기 위한 더 작은 테스트를 지정할 수 있습니다. + +기본적으로 느린 테스트는 건너뛰지만 `RUN_SLOW` 환경 변수를 `yes`로 설정하여 실행할 수 있습니다. 이렇게 하면 많은 기가바이트 단위의 모델이 다운로드되므로 충분한 디스크 공간, 좋은 인터넷 연결과 많은 인내가 필요합니다! + + + +테스트를 실행하려면 *하위 폴더 경로 또는 테스트 파일 경로*를 지정하세요. 그렇지 않으면 `tests` 또는 `examples` 폴더의 모든 테스트를 실행하게 되어 매우 긴 시간이 걸립니다! + + + +```bash +RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./tests/models/my_new_model +RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/text-classification +``` + +느린 테스트와 마찬가지로, 다음과 같이 테스트 중에 기본적으로 활성화되지 않는 다른 환경 변수도 있습니다: +- `RUN_CUSTOM_TOKENIZERS`: 사용자 정의 토크나이저 테스트를 활성화합니다. +- `RUN_PT_FLAX_CROSS_TESTS`: PyTorch + Flax 통합 테스트를 활성화합니다. +- `RUN_PT_TF_CROSS_TESTS`: TensorFlow + PyTorch 통합 테스트를 활성화합니다. + +더 많은 환경 변수와 추가 정보는 [testing_utils.py](src/transformers/testing_utils.py)에서 찾을 수 있습니다. + +🤗 Transformers는 테스트 실행기로 `pytest`를 사용합니다. 그러나 테스트 스위트 자체에서는 `pytest` 관련 기능을 사용하지 않습니다. + +이것은 `unittest`가 완전히 지원된다는 것을 의미합니다. 다음은 `unittest`로 테스트를 실행하는 방법입니다: + +```bash +python -m unittest discover -s tests -t . -v +python -m unittest discover -s examples -t examples -v +``` + +### 스타일 가이드 [[style-guide]] + +문서는 [Google Python 스타일 가이드](https://google.github.io/styleguide/pyguide.html)를 따릅니다. 자세한 정보는 [문서 작성 가이드](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification)를 확인하세요. + +### Windows에서 개발 [[develop-on-windows]] + +Windows에서 개발할 경우([Windows Subsystem for Linux](https://learn.microsoft.com/en-us/windows/wsl/) 또는 WSL에서 작업하지 않는 한) Windows `CRLF` 줄 바꿈을 Linux `LF` 줄 바꿈으로 변환하도록 git을 구성해야 합니다: + +```bash +git config core.autocrlf input +``` + +Windows에서 `make` 명령을 실행하는 한 가지 방법은 MSYS2를 사용하는 것입니다: + +1. [MSYS2](https://www.msys2.org/)를 다운로드합니다. `C:\msys64`에 설치되었다고 가정합니다. +2. CLI에서 `C:\msys64\msys2.exe`를 엽니다 (시작 메뉴에서 사용 가능해야 함). +3. 쉘에서 다음을 실행하여: `pacman -Syu` 및 `pacman -S make`로 `make`를 설치합니다. +4. 환경 변수 PATH에 `C:\msys64\usr\bin`을 추가하세요. + +이제 모든 터미널 (Powershell, cmd.exe 등)에서 `make`를 사용할 수 있습니다! 🎉 + +### 포크한 저장소를 상위 원본 브랜치(main)과 동기화하기 (Hugging Face 저장소) [[sync-a-forked-repository-with-upstream-main-the-hugging-face-repository]] + +포크한 저장소의 main 브랜치를 업데이트할 때, 다음 단계를 따라 수행해주세요. 
이렇게 하면 각 upstream PR에 참조 노트가 추가되는 것을 피하고 이러한 PR에 관여하는 개발자들에게 불필요한 알림이 전송되는 것을 방지할 수 있습니다. + +1. 가능하면 포크된 저장소의 브랜치 및 PR을 사용하여 upstream과 동기화하지 마세요. 대신 포크된 main 저장소에 직접 병합하세요. +2. PR이 반드시 필요한 경우, 브랜치를 확인한 후 다음 단계를 사용하세요: + +```bash +git checkout -b your-branch-for-syncing +git pull --squash --no-commit upstream main +git commit -m '' +git push --set-upstream origin your-branch-for-syncing +``` \ No newline at end of file diff --git a/docs/source/ko/create_a_model.md b/docs/source/ko/create_a_model.md new file mode 100644 index 000000000000..8c7be3291e24 --- /dev/null +++ b/docs/source/ko/create_a_model.md @@ -0,0 +1,388 @@ + + +# 맞춤형 아키텍처 만들기[[create-a-custom-architecture]] + +[`AutoClass`](model_doc/auto)는 모델 아키텍처를 자동으로 추론하고 미리 학습된 configuration과 가중치를 다운로드합니다. 일반적으로 체크포인트에 구애받지 않는 코드를 생성하려면 `AutoClass`를 사용하는 것이 좋습니다. 하지만 특정 모델 파라미터를 보다 세밀하게 제어하고자 하는 사용자는 몇 가지 기본 클래스만으로 커스텀 🤗 Transformers 모델을 생성할 수 있습니다. 이는 🤗 Transformers 모델을 연구, 교육 또는 실험하는 데 관심이 있는 모든 사용자에게 특히 유용할 수 있습니다. 이 가이드에서는 'AutoClass'를 사용하지 않고 커스텀 모델을 만드는 방법에 대해 알아보겠습니다: + +- 모델 configuration을 가져오고 사용자 지정합니다. +- 모델 아키텍처를 생성합니다. +- 텍스트에 사용할 느리거나 빠른 토큰화기를 만듭니다. +- 비전 작업을 위한 이미지 프로세서를 생성합니다. +- 오디오 작업을 위한 특성 추출기를 생성합니다. +- 멀티모달 작업용 프로세서를 생성합니다. + +## Configuration[[configuration]] + +[configuration](main_classes/configuration)은 모델의 특정 속성을 나타냅니다. 각 모델 구성에는 서로 다른 속성이 있습니다. 예를 들어, 모든 NLP 모델에는 `hidden_size`, `num_attention_heads`, `num_hidden_layers` 및 `vocab_size` 속성이 공통으로 있습니다. 이러한 속성은 모델을 구성할 attention heads 또는 hidden layers의 수를 지정합니다. + +[DistilBERT](model_doc/distilbert) 속성을 검사하기 위해 [`DistilBertConfig`]에 접근하여 자세히 살펴봅니다: + +```py +>>> from transformers import DistilBertConfig + +>>> config = DistilBertConfig() +>>> print(config) +DistilBertConfig { + "activation": "gelu", + "attention_dropout": 0.1, + "dim": 768, + "dropout": 0.1, + "hidden_dim": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "model_type": "distilbert", + "n_heads": 12, + "n_layers": 6, + "pad_token_id": 0, + "qa_dropout": 0.1, + "seq_classif_dropout": 0.2, + "sinusoidal_pos_embds": false, + "transformers_version": "4.16.2", + "vocab_size": 30522 +} +``` + +[`DistilBertConfig`]는 기본 [`DistilBertModel`]을 빌드하는 데 사용되는 모든 기본 속성을 표시합니다. 모든 속성은 커스터마이징이 가능하므로 실험을 위한 공간을 만들 수 있습니다. 예를 들어 기본 모델을 다음과 같이 커스터마이즈할 수 있습니다: + +- `activation` 파라미터로 다른 활성화 함수를 사용해 보세요. +- `attention_dropout` 파라미터를 사용하여 어텐션 확률에 더 높은 드롭아웃 비율을 사용하세요. + +```py +>>> my_config = DistilBertConfig(activation="relu", attention_dropout=0.4) +>>> print(my_config) +DistilBertConfig { + "activation": "relu", + "attention_dropout": 0.4, + "dim": 768, + "dropout": 0.1, + "hidden_dim": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "model_type": "distilbert", + "n_heads": 12, + "n_layers": 6, + "pad_token_id": 0, + "qa_dropout": 0.1, + "seq_classif_dropout": 0.2, + "sinusoidal_pos_embds": false, + "transformers_version": "4.16.2", + "vocab_size": 30522 +} +``` + +사전 학습된 모델 속성은 [`~PretrainedConfig.from_pretrained`] 함수에서 수정할 수 있습니다: + +```py +>>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4) +``` + +모델 구성이 만족스러우면 [`~PretrainedConfig.save_pretrained`]로 저장할 수 있습니다. 
설정 파일은 지정된 작업 경로에 JSON 파일로 저장됩니다: + +```py +>>> my_config.save_pretrained(save_directory="./your_model_save_path") +``` + +configuration 파일을 재사용하려면 [`~PretrainedConfig.from_pretrained`]를 사용하여 가져오세요: + +```py +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json") +``` + + + +configuration 파일을 딕셔너리로 저장하거나 사용자 정의 configuration 속성과 기본 configuration 속성의 차이점만 저장할 수도 있습니다! 자세한 내용은 [configuration](main_classes/configuration) 문서를 참조하세요. + + + +## 모델[[model]] + +다음 단계는 [모델(model)](main_classes/models)을 만드는 것입니다. 느슨하게 아키텍처라고도 불리는 모델은 각 계층이 수행하는 동작과 발생하는 작업을 정의합니다. configuration의 `num_hidden_layers`와 같은 속성은 아키텍처를 정의하는 데 사용됩니다. 모든 모델은 기본 클래스 [`PreTrainedModel`]과 입력 임베딩 크기 조정 및 셀프 어텐션 헤드 가지 치기와 같은 몇 가지 일반적인 메소드를 공유합니다. 또한 모든 모델은 [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) 또는 [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/flax.linen.html#module)의 서브클래스이기도 합니다. 즉, 모델은 각 프레임워크의 사용법과 호환됩니다. + + + +사용자 지정 configuration 속성을 모델에 가져옵니다: + +```py +>>> from transformers import DistilBertModel + +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json") +>>> model = DistilBertModel(my_config) +``` + +이제 사전 학습된 가중치 대신 임의의 값을 가진 모델이 생성됩니다. 이 모델을 훈련하기 전까지는 유용하게 사용할 수 없습니다. 훈련은 비용과 시간이 많이 소요되는 프로세스입니다. 일반적으로 훈련에 필요한 리소스의 일부만 사용하면서 더 나은 결과를 더 빨리 얻으려면 사전 훈련된 모델을 사용하는 것이 좋습니다. + +사전 학습된 모델을 [`~PreTrainedModel.from_pretrained`]로 생성합니다: + +```py +>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased") +``` + +🤗 Transformers에서 제공한 모델의 사전 학습된 가중치를 사용하는 경우 기본 모델 configuration을 자동으로 불러옵니다. 그러나 원하는 경우 기본 모델 configuration 속성의 일부 또는 전부를 사용자 지정으로 바꿀 수 있습니다: + +```py +>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) +``` + + +사용자 지정 configuration 속성을 모델에 불러옵니다: + +```py +>>> from transformers import TFDistilBertModel + +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") +>>> tf_model = TFDistilBertModel(my_config) +``` + +이제 사전 학습된 가중치 대신 임의의 값을 가진 모델이 생성됩니다. 이 모델을 훈련하기 전까지는 유용하게 사용할 수 없습니다. 훈련은 비용과 시간이 많이 소요되는 프로세스입니다. 일반적으로 훈련에 필요한 리소스의 일부만 사용하면서 더 나은 결과를 더 빨리 얻으려면 사전 훈련된 모델을 사용하는 것이 좋습니다. + +사전 학습된 모델을 [`~TFPreTrainedModel.from_pretrained`]로 생성합니다: + +```py +>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased") +``` + +🤗 Transformers에서 제공한 모델의 사전 학습된 가중치를 사용하는 경우 기본 모델 configuration을 자동으로 불러옵니다. 그러나 원하는 경우 기본 모델 configuration 속성의 일부 또는 전부를 사용자 지정으로 바꿀 수 있습니다: + +```py +>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) +``` + + + +### 모델 헤드[[model-heads]] + +이 시점에서 *은닉 상태(hidden state)*를 출력하는 기본 DistilBERT 모델을 갖게 됩니다. 은닉 상태는 최종 출력을 생성하기 위해 모델 헤드에 입력으로 전달됩니다. 🤗 Transformers는 모델이 해당 작업을 지원하는 한 각 작업마다 다른 모델 헤드를 제공합니다(즉, 번역과 같은 시퀀스 간 작업에는 DistilBERT를 사용할 수 없음). + + + +예를 들어, [`DistilBertForSequenceClassification`]은 시퀀스 분류 헤드가 있는 기본 DistilBERT 모델입니다. 시퀀스 분류 헤드는 풀링된 출력 위에 있는 선형 레이어입니다. + +```py +>>> from transformers import DistilBertForSequenceClassification + +>>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +다른 모델 헤드로 전환하여 이 체크포인트를 다른 작업에 쉽게 재사용할 수 있습니다. 질의응답 작업의 경우, [`DistilBertForQuestionAnswering`] 모델 헤드를 사용할 수 있습니다. 질의응답 헤드는 숨겨진 상태 출력 위에 선형 레이어가 있다는 점을 제외하면 시퀀스 분류 헤드와 유사합니다. 
+ +```py +>>> from transformers import DistilBertForQuestionAnswering + +>>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") +``` + + +예를 들어, [`TFDistilBertForSequenceClassification`]은 시퀀스 분류 헤드가 있는 기본 DistilBERT 모델입니다. 시퀀스 분류 헤드는 풀링된 출력 위에 있는 선형 레이어입니다. + +```py +>>> from transformers import TFDistilBertForSequenceClassification + +>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +다른 모델 헤드로 전환하여 이 체크포인트를 다른 작업에 쉽게 재사용할 수 있습니다. 질의응답 작업의 경우, [`TFDistilBertForQuestionAnswering`] 모델 헤드를 사용할 수 있습니다. 질의응답 헤드는 숨겨진 상태 출력 위에 선형 레이어가 있다는 점을 제외하면 시퀀스 분류 헤드와 유사합니다. + +```py +>>> from transformers import TFDistilBertForQuestionAnswering + +>>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") +``` + + + +## 토크나이저[[tokenizer]] + +텍스트 데이터에 모델을 사용하기 전에 마지막으로 필요한 기본 클래스는 원시 텍스트를 텐서로 변환하는 [토크나이저](main_classes/tokenizer)입니다. 🤗 Transformers에 사용할 수 있는 토크나이저는 두 가지 유형이 있습니다: + +- [`PreTrainedTokenizer`]: 파이썬으로 구현된 토크나이저입니다. +- [`PreTrainedTokenizerFast`]: Rust 기반 [🤗 Tokenizer](https://huggingface.co/docs/tokenizers/python/latest/) 라이브러리로 만들어진 토크나이저입니다. 이 토크나이저는 Rust로 구현되어 배치 토큰화에서 특히 빠릅니다. 빠른 토크나이저는 토큰을 원래 단어나 문자에 매핑하는 *오프셋 매핑*과 같은 추가 메소드도 제공합니다. +두 토크나이저 모두 인코딩 및 디코딩, 새 토큰 추가, 특수 토큰 관리와 같은 일반적인 방법을 지원합니다. + + + +모든 모델이 빠른 토크나이저를 지원하는 것은 아닙니다. 이 [표](index#supported-frameworks)에서 모델의 빠른 토크나이저 지원 여부를 확인하세요. + + + +토크나이저를 직접 학습한 경우, *어휘(vocabulary)* 파일에서 토크나이저를 만들 수 있습니다: + +```py +>>> from transformers import DistilBertTokenizer + +>>> my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left") +``` + +사용자 지정 토크나이저의 어휘는 사전 학습된 모델의 토크나이저에서 생성된 어휘와 다를 수 있다는 점을 기억하는 것이 중요합니다. 사전 학습된 모델을 사용하는 경우 사전 학습된 모델의 어휘를 사용해야 하며, 그렇지 않으면 입력이 의미를 갖지 못합니다. [`DistilBertTokenizer`] 클래스를 사용하여 사전 학습된 모델의 어휘로 토크나이저를 생성합니다: + +```py +>>> from transformers import DistilBertTokenizer + +>>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") +``` + +[`DistilBertTokenizerFast`] 클래스로 빠른 토크나이저를 생성합니다: + +```py +>>> from transformers import DistilBertTokenizerFast + +>>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased") +``` + + + +[`AutoTokenizer`]는 기본적으로 빠른 토크나이저를 가져오려고 합니다. 이 동작을 비활성화하려면 `from_pretrained`에서 `use_fast=False`를 설정하면 됩니다. + + + +## 이미지 프로세서[[image-processor]] + +이미지 프로세서(image processor)는 비전 입력을 처리합니다. 기본 [`~image_processing_utils.ImageProcessingMixin`] 클래스에서 상속합니다. + +사용하려면 사용 중인 모델과 연결된 이미지 프로세서를 생성합니다. 예를 들어, 이미지 분류에 [ViT](model_doc/vit)를 사용하는 경우 기본 [`ViTImageProcessor`]를 생성합니다: + +```py +>>> from transformers import ViTImageProcessor + +>>> vit_extractor = ViTImageProcessor() +>>> print(vit_extractor) +ViTImageProcessor { + "do_normalize": true, + "do_resize": true, + "feature_extractor_type": "ViTImageProcessor", + "image_mean": [ + 0.5, + 0.5, + 0.5 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": 2, + "size": 224 +} +``` + + + +사용자 지정을 원하지 않는 경우 `from_pretrained` 메소드를 사용하여 모델의 기본 이미지 프로세서 매개변수를 불러오면 됩니다. 
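+
+예를 들어, 아래는 사전 학습된 체크포인트에서 기본 이미지 프로세서 설정을 그대로 불러오는 간단한 예시입니다. 여기서 사용한 `google/vit-base-patch16-224` 체크포인트 이름은 설명을 위한 예시이며, 실제로는 사용 중인 모델의 체크포인트로 바꿔야 합니다:
+
+```py
+>>> from transformers import ViTImageProcessor
+
+>>> # 체크포인트 이름은 예시입니다. 사용 중인 모델의 체크포인트로 바꿔주세요.
+>>> vit_extractor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
+```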
+ + + +사용자 지정 이미지 프로세서를 생성하려면 [`ViTImageProcessor`] 파라미터를 수정합니다: + +```py +>>> from transformers import ViTImageProcessor + +>>> my_vit_extractor = ViTImageProcessor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3]) +>>> print(my_vit_extractor) +ViTImageProcessor { + "do_normalize": false, + "do_resize": true, + "feature_extractor_type": "ViTImageProcessor", + "image_mean": [ + 0.3, + 0.3, + 0.3 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": "PIL.Image.BOX", + "size": 224 +} +``` + +## 특성 추출기[[feature-extractor]] + +특성 추출기(feature extractor)는 오디오 입력을 처리합니다. 기본 [`~feature_extraction_utils.FeatureExtractionMixin`] 클래스에서 상속되며, 오디오 입력을 처리하기 위해 [`SequenceFeatureExtractor`] 클래스에서 상속할 수도 있습니다. + +사용하려면 사용 중인 모델과 연결된 특성 추출기를 생성합니다. 예를 들어, 오디오 분류에 [Wav2Vec2](model_doc/wav2vec2)를 사용하는 경우 기본 [`Wav2Vec2FeatureExtractor`]를 생성합니다: + +```py +>>> from transformers import Wav2Vec2FeatureExtractor + +>>> w2v2_extractor = Wav2Vec2FeatureExtractor() +>>> print(w2v2_extractor) +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 16000 +} +``` + + + +사용자 지정이 필요하지 않은 경우 `from_pretrained` 메소드를 사용하여 모델의 기본 특성 추출기 ㅁ개변수를 불러 오면 됩니다. + + + +사용자 지정 특성 추출기를 만들려면 [`Wav2Vec2FeatureExtractor`] 매개변수를 수정합니다: + +```py +>>> from transformers import Wav2Vec2FeatureExtractor + +>>> w2v2_extractor = Wav2Vec2FeatureExtractor(sampling_rate=8000, do_normalize=False) +>>> print(w2v2_extractor) +Wav2Vec2FeatureExtractor { + "do_normalize": false, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 8000 +} +``` + + +## 프로세서[[processor]] + +멀티모달 작업을 지원하는 모델의 경우, 🤗 Transformers는 특성 추출기 및 토크나이저와 같은 처리 클래스를 단일 객체로 편리하게 래핑하는 프로세서 클래스를 제공합니다. 예를 들어, 자동 음성 인식 작업(Automatic Speech Recognition task (ASR))에 [`Wav2Vec2Processor`]를 사용한다고 가정해 보겠습니다. 자동 음성 인식 작업은 오디오를 텍스트로 변환하므로 특성 추출기와 토크나이저가 필요합니다. + +오디오 입력을 처리할 특성 추출기를 만듭니다: + +```py +>>> from transformers import Wav2Vec2FeatureExtractor + +>>> feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True) +``` + +텍스트 입력을 처리할 토크나이저를 만듭니다: + +```py +>>> from transformers import Wav2Vec2CTCTokenizer + +>>> tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt") +``` + +[`Wav2Vec2Processor`]에서 특성 추출기와 토크나이저를 결합합니다: + +```py +>>> from transformers import Wav2Vec2Processor + +>>> processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) +``` + +configuration과 모델이라는 두 가지 기본 클래스와 추가 전처리 클래스(토크나이저, 이미지 프로세서, 특성 추출기 또는 프로세서)를 사용하면 🤗 Transformers에서 지원하는 모든 모델을 만들 수 있습니다. 이러한 각 기본 클래스는 구성이 가능하므로 원하는 특정 속성을 사용할 수 있습니다. 학습을 위해 모델을 쉽게 설정하거나 기존의 사전 학습된 모델을 수정하여 미세 조정할 수 있습니다. diff --git a/docs/source/ko/custom_models.md b/docs/source/ko/custom_models.md new file mode 100644 index 000000000000..72dad7caaff2 --- /dev/null +++ b/docs/source/ko/custom_models.md @@ -0,0 +1,346 @@ + + +# 사용자 정의 모델 공유하기[[sharing-custom-models]] + +🤗 Transformers 라이브러리는 쉽게 확장할 수 있도록 설계되었습니다. +모든 모델은 추상화 없이 저장소의 지정된 하위 폴더에 완전히 코딩되어 있으므로, 손쉽게 모델링 파일을 복사하고 필요에 따라 조정할 수 있습니다. + +완전히 새로운 모델을 만드는 경우에는 처음부터 시작하는 것이 더 쉬울 수 있습니다. +이 튜토리얼에서는 Transformers 내에서 사용할 수 있도록 사용자 정의 모델과 구성을 작성하는 방법과 +🤗 Transformers 라이브러리에 없는 경우에도 누구나 사용할 수 있도록 (의존성과 함께) 커뮤니티에 공유하는 방법을 배울 수 있습니다. 
+ +[timm 라이브러리](https://github.com/rwightman/pytorch-image-models)의 ResNet 클래스를 [`PreTrainedModel`]로 래핑한 ResNet 모델을 예로 모든 것을 설명합니다. + +## 사용자 정의 구성 작성하기[[writing-a-custom-configuration]] + +모델에 들어가기 전에 먼저 구성을 작성해보도록 하겠습니다. +모델의 `configuration`은 모델을 만들기 위해 필요한 모든 중요한 것들을 포함하고 있는 객체입니다. +다음 섹션에서 볼 수 있듯이, 모델은 `config`를 사용해서만 초기화할 수 있기 때문에 완벽한 구성이 필요합니다. + +아래 예시에서는 ResNet 클래스의 인수(argument)를 조정해보겠습니다. +다른 구성은 가능한 ResNet 중 다른 유형을 제공합니다. +그런 다음 몇 가지 유효성을 확인한 후 해당 인수를 저장합니다. + +```python +from transformers import PretrainedConfig +from typing import List + + +class ResnetConfig(PretrainedConfig): + model_type = "resnet" + + def __init__( + self, + block_type="bottleneck", + layers: List[int] = [3, 4, 6, 3], + num_classes: int = 1000, + input_channels: int = 3, + cardinality: int = 1, + base_width: int = 64, + stem_width: int = 64, + stem_type: str = "", + avg_down: bool = False, + **kwargs, + ): + if block_type not in ["basic", "bottleneck"]: + raise ValueError(f"`block_type` must be 'basic' or bottleneck', got {block_type}.") + if stem_type not in ["", "deep", "deep-tiered"]: + raise ValueError(f"`stem_type` must be '', 'deep' or 'deep-tiered', got {stem_type}.") + + self.block_type = block_type + self.layers = layers + self.num_classes = num_classes + self.input_channels = input_channels + self.cardinality = cardinality + self.base_width = base_width + self.stem_width = stem_width + self.stem_type = stem_type + self.avg_down = avg_down + super().__init__(**kwargs) +``` + +사용자 정의 `configuration`을 작성할 때 기억해야 할 세 가지 중요한 사항은 다음과 같습니다: +- `PretrainedConfig`을 상속해야 합니다. +- `PretrainedConfig`의 `__init__`은 모든 kwargs를 허용해야 하고, +- 이러한 `kwargs`는 상위 클래스 `__init__`에 전달되어야 합니다. + +상속은 🤗 Transformers 라이브러리에서 모든 기능을 가져오는 것입니다. +이러한 점으로부터 비롯되는 두 가지 제약 조건은 `PretrainedConfig`에 설정하는 것보다 더 많은 필드가 있습니다. +`from_pretrained` 메서드로 구성을 다시 로드할 때 해당 필드는 구성에서 수락한 후 상위 클래스로 보내야 합니다. + +모델을 auto 클래스에 등록하지 않는 한, `configuration`에서 `model_type`을 정의(여기서 `model_type="resnet"`)하는 것은 필수 사항이 아닙니다 (마지막 섹션 참조). + +이렇게 하면 라이브러리의 다른 모델 구성과 마찬가지로 구성을 쉽게 만들고 저장할 수 있습니다. +다음은 resnet50d 구성을 생성하고 저장하는 방법입니다: + +```py +resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) +resnet50d_config.save_pretrained("custom-resnet") +``` + +이렇게 하면 `custom-resnet` 폴더 안에 `config.json`이라는 파일이 저장됩니다. +그런 다음 `from_pretrained` 메서드를 사용하여 구성을 다시 로드할 수 있습니다. + +```py +resnet50d_config = ResnetConfig.from_pretrained("custom-resnet") +``` + +구성을 Hub에 직접 업로드하기 위해 [`PretrainedConfig`] 클래스의 [`~PretrainedConfig.push_to_hub`]와 같은 다른 메서드를 사용할 수 있습니다. + + +## 사용자 정의 모델 작성하기[[writing-a-custom-model]] + +이제 ResNet 구성이 있으므로 모델을 작성할 수 있습니다. +실제로는 두 개를 작성할 것입니다. 하나는 이미지 배치에서 hidden features를 추출하는 것([`BertModel`]과 같이), 다른 하나는 이미지 분류에 적합한 것입니다([`BertForSequenceClassification`]과 같이). + +이전에 언급했듯이 이 예제에서는 단순하게 하기 위해 모델의 느슨한 래퍼(loose wrapper)만 작성할 것입니다. +이 클래스를 작성하기 전에 블록 유형과 실제 블록 클래스 간의 매핑 작업만 하면 됩니다. 
+그런 다음 `ResNet` 클래스로 전달되어 `configuration`을 통해 모델이 선언됩니다: + +```py +from transformers import PreTrainedModel +from timm.models.resnet import BasicBlock, Bottleneck, ResNet +from .configuration_resnet import ResnetConfig + + +BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck} + + +class ResnetModel(PreTrainedModel): + config_class = ResnetConfig + + def __init__(self, config): + super().__init__(config) + block_layer = BLOCK_MAPPING[config.block_type] + self.model = ResNet( + block_layer, + config.layers, + num_classes=config.num_classes, + in_chans=config.input_channels, + cardinality=config.cardinality, + base_width=config.base_width, + stem_width=config.stem_width, + stem_type=config.stem_type, + avg_down=config.avg_down, + ) + + def forward(self, tensor): + return self.model.forward_features(tensor) +``` + +이미지 분류 모델을 만들기 위해서는 forward 메소드만 변경하면 됩니다: + +```py +import torch + + +class ResnetModelForImageClassification(PreTrainedModel): + config_class = ResnetConfig + + def __init__(self, config): + super().__init__(config) + block_layer = BLOCK_MAPPING[config.block_type] + self.model = ResNet( + block_layer, + config.layers, + num_classes=config.num_classes, + in_chans=config.input_channels, + cardinality=config.cardinality, + base_width=config.base_width, + stem_width=config.stem_width, + stem_type=config.stem_type, + avg_down=config.avg_down, + ) + + def forward(self, tensor, labels=None): + logits = self.model(tensor) + if labels is not None: + loss = torch.nn.cross_entropy(logits, labels) + return {"loss": loss, "logits": logits} + return {"logits": logits} +``` + +두 경우 모두 `PreTrainedModel`를 상속받고, `config`를 통해 상위 클래스 초기화를 호출하다는 점을 기억하세요 (일반적인 `torch.nn.Module`을 작성할 때와 비슷함). +모델을 auto 클래스에 등록하고 싶은 경우에는 `config_class`를 설정하는 부분이 필수입니다 (마지막 섹션 참조). + + + +라이브러리에 존재하는 모델과 굉장히 유사하다면, 모델을 생성할 때 구성을 참조해 재사용할 수 있습니다. + + + +원하는 것을 모델이 반환하도록 할 수 있지만, `ResnetModelForImageClassification`에서 했던 것 처럼 +레이블을 통과시켰을 때 손실과 함께 사전 형태로 반환하는 것이 [`Trainer`] 클래스 내에서 직접 모델을 사용하기에 유용합니다. +자신만의 학습 루프 또는 다른 학습 라이브러리를 사용할 계획이라면 다른 출력 형식을 사용해도 좋습니다. + +이제 모델 클래스가 있으므로 하나 생성해 보겠습니다: + +```py +resnet50d = ResnetModelForImageClassification(resnet50d_config) +``` + +다시 말하지만, [`~PreTrainedModel.save_pretrained`]또는 [`~PreTrainedModel.push_to_hub`]처럼 [`PreTrainedModel`]에 속하는 모든 메소드를 사용할 수 있습니다. +다음 섹션에서 두 번째 메소드를 사용해 모델 코드와 모델 가중치를 업로드하는 방법을 살펴보겠습니다. +먼저, 모델 내부에 사전 훈련된 가중치를 로드해 보겠습니다. + +이 예제를 활용할 때는, 사용자 정의 모델을 자신만의 데이터로 학습시킬 것입니다. +이 튜토리얼에서는 빠르게 진행하기 위해 사전 훈련된 resnet50d를 사용하겠습니다. +아래 모델은 resnet50d의 래퍼이기 때문에, 가중치를 쉽게 로드할 수 있습니다. + + +```py +import timm + +pretrained_model = timm.create_model("resnet50d", pretrained=True) +resnet50d.model.load_state_dict(pretrained_model.state_dict()) +``` + +이제 [`~PreTrainedModel.save_pretrained`] 또는 [`~PreTrainedModel.push_to_hub`]를 사용할 때 모델 코드가 저장되는지 확인해봅시다. + +## Hub로 코드 업로드하기[[sending-the-code-to-the-hub]] + + + +이 API는 실험적이며 다음 릴리스에서 약간의 변경 사항이 있을 수 있습니다. + + + +먼저 모델이 `.py` 파일에 완전히 정의되어 있는지 확인하세요. +모든 파일이 동일한 작업 경로에 있기 때문에 상대경로 임포트(relative import)에 의존할 수 있습니다 (transformers에서는 이 기능에 대한 하위 모듈을 지원하지 않습니다). +이 예시에서는 작업 경로 안의 `resnet_model`에서 `modeling_resnet.py` 파일과 `configuration_resnet.py` 파일을 정의합니다. +구성 파일에는 `ResnetConfig`에 대한 코드가 있고 모델링 파일에는 `ResnetModel` 및 `ResnetModelForImageClassification`에 대한 코드가 있습니다. + +``` +. +└── resnet_model + ├── __init__.py + ├── configuration_resnet.py + └── modeling_resnet.py +``` + +Python이 `resnet_model`을 모듈로 사용할 수 있도록 감지하는 목적이기 때문에 `__init__.py`는 비어 있을 수 있습니다. 
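+
+`__init__.py`를 비워 두어도 되지만, 로컬에서 임포트를 간편하게 하기 위해 다음과 같이 클래스를 다시 내보내는(re-export) 내용을 담을 수도 있습니다. 이는 필수가 아닌 선택적인 예시입니다:
+
+```py
+# resnet_model/__init__.py (선택 사항인 예시입니다. 비워 두어도 됩니다.)
+from .configuration_resnet import ResnetConfig
+from .modeling_resnet import ResnetModel, ResnetModelForImageClassification
+```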
+ + + +라이브러리에서 모델링 파일을 복사하는 경우, +모든 파일 상단에 있는 상대 경로 임포트(relative import) 부분을 `transformers` 패키지에서 임포트 하도록 변경해야 합니다. + + + +기존 구성이나 모델을 재사용(또는 서브 클래스화)할 수 있습니다. + +커뮤니티에 모델을 공유하기 위해서는 다음 단계를 따라야 합니다: +먼저, 새로 만든 파일에 ResNet 모델과 구성을 임포트합니다: + +```py +from resnet_model.configuration_resnet import ResnetConfig +from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification +``` + +다음으로 `save_pretrained` 메소드를 사용해 해당 객체의 코드 파일을 복사하고, +복사한 파일을 Auto 클래스로 등록하고(모델인 경우) 실행합니다: + +```py +ResnetConfig.register_for_auto_class() +ResnetModel.register_for_auto_class("AutoModel") +ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification") +``` + +`configuration`에 대한 auto 클래스를 지정할 필요는 없지만(`configuration` 관련 auto 클래스는 AutoConfig 클래스 하나만 있음), 모델의 경우에는 지정해야 합니다. +사용자 지정 모델은 다양한 작업에 적합할 수 있으므로, 모델에 맞는 auto 클래스를 지정해야 합니다. + +다음으로, 이전에 작업했던 것과 마찬가지로 구성과 모델을 작성합니다: + +```py +resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) +resnet50d = ResnetModelForImageClassification(resnet50d_config) + +pretrained_model = timm.create_model("resnet50d", pretrained=True) +resnet50d.model.load_state_dict(pretrained_model.state_dict()) +``` + +이제 모델을 Hub로 업로드하기 위해 로그인 상태인지 확인하세요. +터미널에서 다음 코드를 실행해 확인할 수 있습니다: + +```bash +huggingface-cli login +``` + +주피터 노트북의 경우에는 다음과 같습니다: + +```py +from huggingface_hub import notebook_login + +notebook_login() +``` + +그런 다음 이렇게 자신의 네임스페이스(또는 자신이 속한 조직)에 업로드할 수 있습니다: +```py +resnet50d.push_to_hub("custom-resnet50d") +``` + +On top of the modeling weights and the configuration in json format, this also copied the modeling and +configuration `.py` files in the folder `custom-resnet50d` and uploaded the result to the Hub. You can check the result +in this [model repo](https://huggingface.co/sgugger/custom-resnet50d). +json 형식의 모델링 가중치와 구성 외에도 `custom-resnet50d` 폴더 안의 모델링과 구성 `.py` 파일을 복사하해 Hub에 업로드합니다. +[모델 저장소](https://huggingface.co/sgugger/custom-resnet50d)에서 결과를 확인할 수 있습니다. + +[sharing tutorial](model_sharing) 문서의 `push_to_hub` 메소드에서 자세한 내용을 확인할 수 있습니다. + + +## 사용자 정의 코드로 모델 사용하기[[using-a-model-with-custom-code]] + +auto 클래스와 `from_pretrained` 메소드를 사용하여 사용자 지정 코드 파일과 함께 모든 구성, 모델, 토크나이저를 사용할 수 있습니다. +Hub에 업로드된 모든 파일 및 코드는 멜웨어가 있는지 검사되지만 (자세한 내용은 [Hub 보안](https://huggingface.co/docs/hub/security#malware-scanning) 설명 참조), +자신의 컴퓨터에서 모델 코드와 작성자가 악성 코드를 실행하지 않는지 확인해야 합니다. +사용자 정의 코드로 모델을 사용하려면 `trust_remote_code=True`로 설정하세요: + +```py +from transformers import AutoModelForImageClassification + +model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True) +``` + +모델 작성자가 악의적으로 코드를 업데이트하지 않았다는 점을 확인하기 위해, 커밋 해시(commit hash)를 `revision`으로 전달하는 것도 강력히 권장됩니다 (모델 작성자를 완전히 신뢰하지 않는 경우). + +```py +commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292" +model = AutoModelForImageClassification.from_pretrained( + "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash +) +``` + +Hub에서 모델 저장소의 커밋 기록을 찾아볼 때, 모든 커밋의 커밋 해시를 쉽게 복사할 수 있는 버튼이 있습니다. + +## 사용자 정의 코드로 만든 모델을 auto 클래스로 등록하기[[registering-a-model-with-custom-code-to-the-auto-classes]] + +🤗 Transformers를 상속하는 라이브러리를 작성하는 경우 사용자 정의 모델을 auto 클래스에 추가할 수 있습니다. +사용자 정의 모델을 사용하기 위해 해당 라이브러리를 임포트해야 하기 때문에, 이는 Hub로 코드를 업로드하는 것과 다릅니다 (Hub에서 자동적으로 모델 코드를 다운로드 하는 것과 반대). 
+ +구성에 기존 모델 유형과 다른 `model_type` 속성이 있고 모델 클래스에 올바른 `config_class` 속성이 있는 한, +다음과 같이 auto 클래스에 추가할 수 있습니다: + +```py +from transformers import AutoConfig, AutoModel, AutoModelForImageClassification + +AutoConfig.register("resnet", ResnetConfig) +AutoModel.register(ResnetConfig, ResnetModel) +AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification) +``` + +사용자 정의 구성을 [`AutoConfig`]에 등록할 때 사용되는 첫 번째 인수는 사용자 정의 구성의 `model_type`과 일치해야 합니다. +또한, 사용자 정의 모델을 auto 클래스에 등록할 때 사용되는 첫 번째 인수는 해당 모델의 `config_class`와 일치해야 합니다. \ No newline at end of file diff --git a/docs/source/ko/custom_tools.md b/docs/source/ko/custom_tools.md new file mode 100644 index 000000000000..87017a68b524 --- /dev/null +++ b/docs/source/ko/custom_tools.md @@ -0,0 +1,748 @@ + + +# 사용자 정의 도구와 프롬프트[[custom-tools-and-prompts]] + + + +Transformers와 관련하여 어떤 도구와 에이전트가 있는지 잘 모르신다면 [Transformers Agents](transformers_agents) 페이지를 먼저 읽어보시기 바랍니다. + + + + + +Transformers Agents는 실험 중인 API로 언제든지 변경될 수 있습니다. +API 또는 기반 모델이 변경되기 쉽기 때문에 에이전트가 반환하는 결과도 달라질 수 있습니다. + + + +에이전트에게 권한을 부여하고 새로운 작업을 수행하게 하려면 사용자 정의 도구와 프롬프트를 만들고 사용하는 것이 무엇보다 중요합니다. +이 가이드에서는 다음과 같은 내용을 살펴보겠습니다: + +- 프롬프트를 사용자 정의하는 방법 +- 사용자 정의 도구를 사용하는 방법 +- 사용자 정의 도구를 만드는 방법 + +## 프롬프트를 사용자 정의하기[[customizing-the-prompt]] + +[Transformers Agents](transformers_agents)에서 설명한 것처럼 에이전트는 [`~Agent.run`] 및 [`~Agent.chat`] 모드에서 실행할 수 있습니다. +`run`(실행) 모드와 `chat`(채팅) 모드 모두 동일한 로직을 기반으로 합니다. +에이전트를 구동하는 언어 모델은 긴 프롬프트에 따라 조건이 지정되고, 중지 토큰에 도달할 때까지 다음 토큰을 생성하여 프롬프트를 완수합니다. +`chat` 모드에서는 프롬프트가 이전 사용자 입력 및 모델 생성으로 연장된다는 점이 두 모드의 유일한 차이점입니다. +이를 통해 에이전트가 과거 상호작용에 접근할 수 있게 되므로 에이전트에게 일종의 메모리를 제공하는 셈입니다. + +### 프롬프트의 구조[[structure-of-the-prompt]] + +어떻게 프롬프트 사용자 정의를 잘 할 수 있는지 이해하기 위해 프롬프트의 구조를 자세히 살펴봅시다. +프롬프트는 크게 네 부분으로 구성되어 있습니다. + +- 1. 도입: 에이전트가 어떻게 행동해야 하는지, 도구의 개념에 대한 설명. +- 2. 모든 도구에 대한 설명. 이는 런타임에 사용자가 정의/선택한 도구로 동적으로 대체되는 `<>` 토큰으로 정의됩니다. +- 3. 작업 예제 및 해당 솔루션 세트. +- 4. 현재 예제 및 해결 요청. + +각 부분을 더 잘 이해할 수 있도록 짧은 버전을 통해 `run` 프롬프트가 어떻게 보이는지 살펴보겠습니다: + +````text +I will ask you to perform a task, your job is to come up with a series of simple commands in Python that will perform the task. +[...] +You can print intermediate results if it makes sense to do so. + +Tools: +- document_qa: This is a tool that answers a question about a document (pdf). It takes an input named `document` which should be the document containing the information, as well as a `question` that is the question about the document. It returns a text that contains the answer to the question. +- image_captioner: This is a tool that generates a description of an image. It takes an input named `image` which should be the image to the caption and returns a text that contains the description in English. +[...] + +Task: "Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French." + +I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image. + +Answer: +```py +translated_question = translator(question=question, src_lang="French", tgt_lang="English") +print(f"The translated question is {translated_question}.") +answer = image_qa(image=image, question=translated_question) +print(f"The answer is {answer}") +``` + +Task: "Identify the oldest person in the `document` and create an image showcasing the result as a banner." 
+ +I will use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer. + +Answer: +```py +answer = document_qa(document, question="What is the oldest person?") +print(f"The answer is {answer}.") +image = image_generator("A banner showing " + answer) +``` + +[...] + +Task: "Draw me a picture of rivers and lakes" + +I will use the following +```` + +도입(*"도구:"* 앞의 텍스트)에서는 모델이 어떻게 작동하고 무엇을 해야 하는지 정확하게 설명합니다. +에이전트는 항상 같은 방식으로 작동해야 하므로 이 부분은 사용자 정의할 필요가 없을 가능성이 높습니다. + +두 번째 부분(*"도구"* 아래의 글머리 기호)은 `run` 또는 `chat`을 호출할 때 동적으로 추가됩니다. +정확히 `agent.toolbox`에 있는 도구 수만큼 글머리 기호가 있고, 각 글머리 기호는 도구의 이름과 설명으로 구성됩니다: + +```text +- : +``` + +문서 질의응답 도구를 가져오고 이름과 설명을 출력해서 빠르게 확인해 보겠습니다. + +```py +from transformers import load_tool + +document_qa = load_tool("document-question-answering") +print(f"- {document_qa.name}: {document_qa.description}") +``` + +그러면 다음 결과가 출력됩니다: +```text +- document_qa: This is a tool that answers a question about a document (pdf). It takes an input named `document` which should be the document containing the information, as well as a `question` that is the question about the document. It returns a text that contains the answer to the question. +``` + +여기서 도구 이름이 짧고 정확하다는 것을 알 수 있습니다. +설명은 두 부분으로 구성되어 있는데, 첫 번째 부분에서는 도구의 기능을 설명하고 두 번째 부분에서는 예상되는 입력 인수와 반환 값을 명시합니다. + +에이전트가 도구를 올바르게 사용하려면 좋은 도구 이름과 도구 설명이 매우 중요합니다. +에이전트가 도구에 대해 알 수 있는 유일한 정보는 이름과 설명뿐이므로, 이 두 가지를 정확하게 작성하고 도구 상자에 있는 기존 도구의 스타일과 일치하는지 확인해야 합니다. +특히 이름에 따라 예상되는 모든 인수가 설명에 코드 스타일로 언급되어 있는지, 예상되는 유형과 그 유형이 무엇인지에 대한 설명이 포함되어 있는지 확인하세요. + + + +도구에 어떤 이름과 설명이 있어야 하는지 이해하려면 엄선된 Transformers 도구의 이름과 설명을 확인하세요. +[`Agent.toolbox`] 속성을 가진 모든 도구를 볼 수 있습니다. + + + +세 번째 부분에는 에이전트가 어떤 종류의 사용자 요청에 대해 어떤 코드를 생성해야 하는지 정확하게 보여주는 엄선된 예제 세트가 포함되어 있습니다. +에이전트를 지원하는 대규모 언어 모델은 프롬프트에서 패턴을 인식하고 새로운 데이터로 패턴을 반복하는 데 매우 능숙합니다. +따라서 에이전트가 실제로 올바른 실행 가능한 코드를 생성할 가능성을 극대화하는 방식으로 예제를 작성하는 것이 매우 중요합니다. + +한 가지 예를 살펴보겠습니다: + +````text +Task: "Identify the oldest person in the `document` and create an image showcasing the result as a banner." + +I will use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer. + +Answer: +```py +answer = document_qa(document, question="What is the oldest person?") +print(f"The answer is {answer}.") +image = image_generator("A banner showing " + answer) +``` + +```` +작업 설명, 에이전트가 수행하려는 작업에 대한 설명, 마지막으로 생성된 코드, 이 세 부분으로 구성된 프롬프트는 모델에 반복하여 제공됩니다. +프롬프트의 일부인 모든 예제는 이러한 정확한 패턴으로 되어 있으므로, 에이전트가 새 토큰을 생성할 때 정확히 동일한 패턴을 재현할 수 있습니다. + +프롬프트 예제는 Transformers 팀이 선별하고 일련의 [problem statements](https://github.com/huggingface/transformers/blob/main/src/transformers/tools/evaluate_agent.py)에 따라 엄격하게 평가하여 +에이전트의 프롬프트가 에이전트의 실제 사용 사례를 최대한 잘 해결할 수 있도록 보장합니다. + +프롬프트의 마지막 부분은 다음에 해당합니다: +```text +Task: "Draw me a picture of rivers and lakes" + +I will use the following +``` + +이는 에이전트가 완료해야 할 최종적인 미완성 예제입니다. 미완성 예제는 실제 사용자 입력에 따라 동적으로 만들어집니다. +위 예시의 경우 사용자가 다음과 같이 실행했습니다: + +```py +agent.run("Draw me a picture of rivers and lakes") +``` + +사용자 입력 - *즉* Task: *"Draw me a picture of rivers and lakes"*가 프롬프트 템플릿에 맞춰 "Task: \n\n I will use the following"로 캐스팅됩니다. +이 문장은 에이전트에게 조건이 적용되는 프롬프트의 마지막 줄을 구성하므로 에이전트가 이전 예제에서 수행한 것과 정확히 동일한 방식으로 예제를 완료하도록 강력하게 영향을 미칩니다. + +너무 자세히 설명하지 않더라도 채팅 템플릿의 프롬프트 구조는 동일하지만 예제의 스타일이 약간 다릅니다. *예를 들면*: + +````text +[...] + +===== + +Human: Answer the question in the variable `question` about the image stored in the variable `image`. 
+ +Assistant: I will use the tool `image_qa` to answer the question on the input image. + +```py +answer = image_qa(text=question, image=image) +print(f"The answer is {answer}") +``` + +Human: I tried this code, it worked but didn't give me a good result. The question is in French + +Assistant: In this case, the question needs to be translated first. I will use the tool `translator` to do this. + +```py +translated_question = translator(question=question, src_lang="French", tgt_lang="English") +print(f"The translated question is {translated_question}.") +answer = image_qa(text=translated_question, image=image) +print(f"The answer is {answer}") +``` + +===== + +[...] +```` + +`run` 프롬프트의 예와는 반대로, 각 `chat` 프롬프트의 예에는 *Human(사람)*과 *Assistant(어시스턴트)* 간에 하나 이상의 교환이 있습니다. 모든 교환은 `run` 프롬프트의 예와 유사한 구조로 되어 있습니다. +사용자의 입력이 *Human:* 뒤에 추가되며, 에이전트에게 코드를 생성하기 전에 수행해야 할 작업을 먼저 생성하라는 메시지가 표시됩니다. +교환은 이전 교환을 기반으로 할 수 있으므로 위와 같이 사용자가 "**이** 코드를 시도했습니다"라고 입력하면 이전에 생성된 에이전트의 코드를 참조하여 과거 교환을 참조할 수 있습니다. + +`.chat`을 실행하면 사용자의 입력 또는 *작업*이 미완성된 양식의 예시로 캐스팅됩니다: +```text +Human: \n\nAssistant: +``` +그러면 에이전트가 이를 완성합니다. `run` 명령과 달리 `chat` 명령은 완료된 예제를 프롬프트에 추가하여 에이전트에게 다음 `chat` 차례에 대한 더 많은 문맥을 제공합니다. + +이제 프롬프트가 어떻게 구성되어 있는지 알았으니 어떻게 사용자 정의할 수 있는지 살펴봅시다! + +### 좋은 사용자 입력 작성하기[[writing-good-user-inputs]] + +대규모 언어 모델이 사용자의 의도를 이해하는 능력이 점점 더 향상되고 있지만, 에이전트가 올바른 작업을 선택할 수 있도록 최대한 정확성을 유지하는 것은 큰 도움이 됩니다. +최대한 정확하다는 것은 무엇을 의미할까요? + +에이전트는 프롬프트에서 도구 이름 목록과 해당 설명을 볼 수 있습니다. +더 많은 도구가 추가될수록 에이전트가 올바른 도구를 선택하기가 더 어려워지고 실행할 도구의 올바른 순서를 선택하는 것은 더욱 어려워집니다. +일반적인 실패 사례를 살펴보겠습니다. 여기서는 분석할 코드만 반환하겠습니다. + +```py +from transformers import HfAgent + +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder") + +agent.run("Show me a tree", return_code=True) +``` + +그러면 다음 결과가 출력됩니다: + +```text +==Explanation from the agent== +I will use the following tool: `image_segmenter` to create a segmentation mask for the image. + + +==Code generated by the agent== +mask = image_segmenter(image, prompt="tree") +``` + +우리가 원했던 결과가 아닐 수도 있습니다. 대신 나무 이미지가 생성되기를 원할 가능성이 더 높습니다. +따라서 에이전트가 특정 도구를 사용하도록 유도하려면 도구의 이름과 설명에 있는 중요한 키워드를 사용하는 것이 매우 유용할 수 있습니다. 한번 살펴보겠습니다. +```py +agent.toolbox["image_generator"].description +``` + +```text +'This is a tool that creates an image according to a prompt, which is a text description. It takes an input named `prompt` which contains the image description and outputs an image. +``` + +이름과 설명은 "image", "prompt", "create" 및 "generate" 키워드를 사용합니다. 이 단어들을 사용하면 더 잘 작동할 가능성이 높습니다. 프롬프트를 조금 더 구체화해 보겠습니다. + +```py +agent.run("Create an image of a tree", return_code=True) +``` + +이 코드는 다음 프롬프트를 만들어냅니다: +```text +==Explanation from the agent== +I will use the following tool `image_generator` to generate an image of a tree. + + +==Code generated by the agent== +image = image_generator(prompt="tree") +``` + +훨씬 낫네요! 저희가 원했던 것과 비슷해 보입니다. +즉, 에이전트가 작업을 올바른 도구에 올바르게 매핑하는 데 어려움을 겪고 있다면 도구 이름과 설명에서 가장 관련성이 높은 키워드를 찾아보고 이를 통해 작업 요청을 구체화해 보세요. + +### 도구 설명 사용자 정의하기[[customizing-the-tool-descriptions]] + +앞서 살펴본 것처럼 에이전트는 각 도구의 이름과 설명에 액세스할 수 있습니다. +기본 도구에는 매우 정확한 이름과 설명이 있어야 하지만 특정 사용 사례에 맞게 도구의 설명이나 이름을 변경하는 것이 도움이 될 수도 있습니다. +이는 매우 유사한 여러 도구를 추가했거나 특정 도메인(*예*: 이미지 생성 및 변환)에만 에이전트를 사용하려는 경우에 특히 중요해질 수 있습니다. + +일반적인 문제는 이미지 생성 작업에 많이 사용되는 경우 에이전트가 이미지 생성과 이미지 변환/수정을 혼동하는 것입니다. 
*예를 들어,* +```py +agent.run("Make an image of a house and a car", return_code=True) +``` +그러면 다음 결과가 출력됩니다: +```text +==Explanation from the agent== +I will use the following tools `image_generator` to generate an image of a house and `image_transformer` to transform the image of a car into the image of a house. + +==Code generated by the agent== +house_image = image_generator(prompt="A house") +car_image = image_generator(prompt="A car") +house_car_image = image_transformer(image=car_image, prompt="A house") +``` + +결과물이 우리가 여기서 원하는 것과 정확히 일치하지 않을 수 있습니다. 에이전트가 `image_generator`와 `image_transformer`의 차이점을 이해하기 어려워서 두 가지를 함께 사용하는 경우가 많은 것 같습니다. + +여기서 `image_transformer`의 도구 이름과 설명을 변경하여 에이전트가 도울 수 있습니다. +"image" 및 "prompt"와 약간 분리하기 위해 `modifier`라고 대신 부르겠습니다: +```py +agent.toolbox["modifier"] = agent.toolbox.pop("image_transformer") +agent.toolbox["modifier"].description = agent.toolbox["modifier"].description.replace( + "transforms an image according to a prompt", "modifies an image" +) +``` + +이제 "modify"은 새 이미지 프로세서를 사용하라는 강력한 신호이므로 위의 프롬프트에 도움이 될 것입니다. 다시 실행해 봅시다. + +```py +agent.run("Make an image of a house and a car", return_code=True) +``` + +여기서 다음과 같은 결과를 얻게 됩니다: +```text +==Explanation from the agent== +I will use the following tools: `image_generator` to generate an image of a house, then `image_generator` to generate an image of a car. + + +==Code generated by the agent== +house_image = image_generator(prompt="A house") +car_image = image_generator(prompt="A car") +``` + +우리가 염두에 두었던 것과 확실히 더 가까워졌습니다! 하지만 집과 자동차가 모두 같은 이미지에 포함되면 좋겠습니다. 작업을 단일 이미지 생성에 더 집중하면 도움이 될 것입니다: + +```py +agent.run("Create image: 'A house and car'", return_code=True) +``` + +```text +==Explanation from the agent== +I will use the following tool: `image_generator` to generate an image. + + +==Code generated by the agent== +image = image_generator(prompt="A house and car") +``` + + + +에이전트는 여전히 특히 여러 개체의 이미지를 생성하는 것과 같이 약간 더 복잡한 사용 사례에서 취약한 경우가 많습니다. +앞으로 몇 달 안에 에이전트 자체와 기본 프롬프트가 더욱 개선되어 에이전트가 다양한 사용자 입력에 더욱 강력하게 대응할 수 있도록 할 예정입니다. + + + +### 전체 프롬프트 사용자 정의하기[[customizing-the-whole-prompt]] + +사용자에게 최대한의 유연성을 제공하기 위해 [위](#structure-of-the-prompt)에 설명된 전체 프롬프트 템플릿을 사용자가 덮어쓸 수 있습니다. +이 경우 사용자 정의 프롬프트에 소개 섹션, 도구 섹션, 예제 섹션 및 미완성 예제 섹션이 포함되어 있는지 확인하세요. +`run` 프롬프트 템플릿을 덮어쓰려면 다음과 같이 하면 됩니다: + +```py +template = """ [...] """ + +agent = HfAgent(your_endpoint, run_prompt_template=template) +``` + + + +에이전트가 사용 가능한 도구를 인식하고 사용자의 프롬프트를 올바르게 삽입할 수 있도록 `<>` 문자열과 `<>`를 `template` 어딘가에 정의해야 합니다. + + + +마찬가지로 `chat` 프롬프트 템플릿을 덮어쓸 수 있습니다. `chat` 모드에서는 항상 다음과 같은 교환 형식을 사용한다는 점에 유의하세요: + +```text +Human: <> + +Assistant: +``` + +따라서 사용자 정의 `chat` 프롬프트 템플릿의 예제에서도 이 형식을 사용하는 것이 중요합니다. +다음과 같이 인스턴스화 할 때 `chat` 템플릿을 덮어쓸 수 있습니다. + +``` +template = """ [...] """ + +agent = HfAgent(url_endpoint=your_endpoint, chat_prompt_template=template) +``` + + + +에이전트가 사용 가능한 도구를 인식할 수 있도록 `<>` 문자열을 `template` 어딘가에 정의해야 합니다. + + + +두 경우 모두 커뮤니티의 누군가가 호스팅하는 템플릿을 사용하려는 경우 프롬프트 템플릿 대신 저장소 ID를 전달할 수 있습니다. +기본 프롬프트는 [이 저장소](https://huggingface.co/datasets/huggingface-tools/default-prompts)를 예로 들 수 있습니다. + +Hub의 저장소에 사용자 정의 프롬프트를 업로드하여 커뮤니티와 공유하려면 다음을 확인하세요: +- 데이터 세트 저장소를 사용하세요. +- `run` 명령에 대한 프롬프트 템플릿을 `run_prompt_template.txt`라는 파일에 넣으세요. +- `chat` 명령에 대한 프롬프트 템플릿을 `chat_prompt_template.txt`라는 파일에 넣으세요. 
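+
+예를 들어, 아래는 위에서 설명한 프롬프트 템플릿 파일을 `huggingface_hub` 라이브러리로 데이터 세트 저장소에 업로드하는 간단한 스케치입니다. 저장소 이름 `your-username/my-agent-prompts`와 파일 경로는 설명을 위해 가정한 값입니다:
+
+```py
+from huggingface_hub import HfApi
+
+api = HfApi()
+
+# 데이터 세트 저장소를 만듭니다 (저장소 이름은 예시입니다).
+api.create_repo("your-username/my-agent-prompts", repo_type="dataset", exist_ok=True)
+
+# 로컬에 저장해 둔 프롬프트 템플릿 파일을 업로드합니다.
+for filename in ["run_prompt_template.txt", "chat_prompt_template.txt"]:
+    api.upload_file(
+        path_or_fileobj=filename,
+        path_in_repo=filename,
+        repo_id="your-username/my-agent-prompts",
+        repo_type="dataset",
+    )
+```
+
+이렇게 업로드한 뒤에는 앞서 설명한 것처럼 프롬프트 템플릿 문자열 대신 이 저장소 ID를 전달하면 됩니다.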
+ +## 사용자 정의 도구 사용하기[[using-custom-tools]] + +이 섹션에서는 이미지 생성에 특화된 두 가지 기존 사용자 정의 도구를 활용하겠습니다: + +- 더 많은 이미지 수정을 허용하기 위해 [huggingface-tools/image-transformation](https://huggingface.co/spaces/huggingface-tools/image-transformation)을 + [diffusers/controlnet-canny-tool](https://huggingface.co/spaces/diffusers/controlnet-canny-tool)로 대체합니다. +- 기본 도구 상자에 이미지 업스케일링을 위한 새로운 도구가 추가되었습니다: + [diffusers/latent-upscaler-tool](https://huggingface.co/spaces/diffusers/latent-upscaler-tool)가 기존 이미지 변환 도구를 대체합니다. + +편리한 [`load_tool`] 함수를 사용하여 사용자 정의 도구를 가져오는 것으로 시작하겠습니다: + +```py +from transformers import load_tool + +controlnet_transformer = load_tool("diffusers/controlnet-canny-tool") +upscaler = load_tool("diffusers/latent-upscaler-tool") +``` + +에이전트에게 사용자 정의 도구를 추가하면 도구의 설명과 이름이 에이전트의 프롬프트에 자동으로 포함됩니다. +따라서 에이전트가 사용 방법을 이해할 수 있도록 사용자 정의 도구의 설명과 이름을 잘 작성해야 합니다. +`controlnet_transformer`의 설명과 이름을 살펴보겠습니다: + +```py +print(f"Description: '{controlnet_transformer.description}'") +print(f"Name: '{controlnet_transformer.name}'") +``` + +그러면 다음 결과가 출력됩니다: +```text +Description: 'This is a tool that transforms an image with ControlNet according to a prompt. +It takes two inputs: `image`, which should be the image to transform, and `prompt`, which should be the prompt to use to change it. It returns the modified image.' +Name: 'image_transformer' +``` + +이름과 설명이 정확하고 [큐레이팅 된 도구 세트(curated set of tools)](./transformers_agents#a-curated-set-of-tools)의 스타일에 맞습니다. +다음으로, `controlnet_transformer`와 `upscaler`로 에이전트를 인스턴스화해 봅시다: +```py +tools = [controlnet_transformer, upscaler] +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=tools) +``` + +이 명령을 실행하면 다음 정보가 표시됩니다: + +```text +image_transformer has been replaced by as provided in `additional_tools` +``` + +큐레이팅된 도구 세트에는 이미 'image_transformer' 도구가 있으며, 이 도구는 사용자 정의 도구로 대체됩니다. + + + +기존 도구와 똑같은 작업에 사용자 정의 도구를 사용하려는 경우 기존 도구를 덮어쓰는 것이 유용할 수 있습니다. +에이전트가 해당 작업에 능숙하기 때문입니다. +이 경우 사용자 정의 도구가 덮어쓴 도구와 정확히 동일한 API를 따라야 하며, 그렇지 않으면 해당 도구를 사용하는 모든 예제가 업데이트되도록 프롬프트 템플릿을 조정해야 한다는 점에 유의하세요. + + + +업스케일러 도구에 지정된 'image_upscaler'라는 이름 아직 기본 도구 상자에는 존재하지 않기 때문에, 도구 목록에 해당 이름이 간단히 추가되었습니다. +에이전트가 현재 사용할 수 있는 도구 상자는 언제든지 `agent.toolbox` 속성을 통해 확인할 수 있습니다: + +```py +print("\n".join([f"- {a}" for a in agent.toolbox.keys()])) +``` + +```text +- document_qa +- image_captioner +- image_qa +- image_segmenter +- transcriber +- summarizer +- text_classifier +- text_qa +- text_reader +- translator +- image_transformer +- text_downloader +- image_generator +- video_generator +- image_upscaler +``` + +에이전트의 도구 상자에 `image_upscaler`가 추가된 점을 주목하세요. + +이제 새로운 도구를 사용해봅시다! [Transformers Agents Quickstart](./transformers_agents#single-execution-run)에서 생성한 이미지를 다시 사용하겠습니다. + +```py +from diffusers.utils import load_image + +image = load_image( + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" +) +``` + + + +이미지를 아름다운 겨울 풍경으로 바꿔 봅시다: + +```py +image = agent.run("Transform the image: 'A frozen lake and snowy forest'", image=image) +``` + +```text +==Explanation from the agent== +I will use the following tool: `image_transformer` to transform the image. + + +==Code generated by the agent== +image = image_transformer(image, prompt="A frozen lake and snowy forest") +``` + + + +새로운 이미지 처리 도구는 이미지를 매우 강력하게 수정할 수 있는 ControlNet을 기반으로 합니다. +기본적으로 이미지 처리 도구는 512x512 픽셀 크기의 이미지를 반환합니다. 이를 업스케일링할 수 있는지 살펴봅시다. 
+
+```py
+image = agent.run("Upscale the image", image)
+```
+
+```text
+==Explanation from the agent==
+I will use the following tool: `image_upscaler` to upscale the image.
+
+
+==Code generated by the agent==
+upscaled_image = image_upscaler(image)
+```
+
+The agent automatically mapped the prompt "Upscale the image" to the upscaler tool we just added, based solely on the tool's description and name, and was able to run it correctly.
+
+Next, let's take a look at how you can create a new custom tool.
+
+### Adding new tools[[adding-new-tools]]
+
+In this section, we show how to create a new tool that can be added to the agent.
+
+#### Creating a new tool[[creating-a-new-tool]]
+
+We'll first start by creating a tool. We'll add the not-so-useful yet fun task of fetching the model on the Hugging Face Hub with the most downloads for a given task.
+
+We can do that with the following code:
+
+```python
+from huggingface_hub import list_models
+
+task = "text-classification"
+
+model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
+print(model.id)
+```
+
+For the task `text-classification`, this returns `'facebook/bart-large-mnli'`; for `translation`, it returns `'t5-base'`.
+
+How do we convert this to a tool that the agent can leverage?
+All tools depend on the superclass `Tool`, which holds the main attributes they need. Let's create a class that inherits from it:
+
+```python
+from transformers import Tool
+
+
+class HFModelDownloadsTool(Tool):
+    pass
+```
+
+This class has a few requirements:
+- An attribute `name`, which corresponds to the name of the tool itself. To be in tune with other tools that have a performative name, we'll name it `model_download_counter`.
+- An attribute `description`, which is used to populate the agent's prompt.
+- `inputs` and `outputs` attributes. Defining these helps the Python interpreter make informed choices about types,
+  and allows a gradio demo to be spawned when the tool is pushed to the Hub.
+  Both are a list of expected values, which can be `text`, `image`, or `audio`.
+- A `__call__` method that contains the inference code. This is the code we played with above!
+
+Here's what the class looks like now:
+
+```python
+from transformers import Tool
+from huggingface_hub import list_models
+
+
+class HFModelDownloadsTool(Tool):
+    name = "model_download_counter"
+    description = (
+        "This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. "
+        "It takes the name of the category (such as text-classification, depth-estimation, etc), and "
+        "returns the name of the checkpoint."
+    )
+
+    inputs = ["text"]
+    outputs = ["text"]
+
+    def __call__(self, task: str):
+        model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
+        return model.id
+```
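+
+As a quick, illustrative sanity check, you can instantiate the class and call it directly before involving any agent; this simply exercises the `__call__` method defined above and should print the same checkpoint as the earlier snippet (`'facebook/bart-large-mnli'` at the time of writing):
+
+```python
+tool = HFModelDownloadsTool()
+
+# Calling the tool directly runs its __call__ method.
+print(tool("text-classification"))
+```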
+
+We now have our tool handy. Save it in a file and import it from your main script. Let's name this file `model_downloads.py`, so the resulting import code looks like this:
+
+```python
+from model_downloads import HFModelDownloadsTool
+
+tool = HFModelDownloadsTool()
+```
+
+In order to let others benefit from it and for simpler initialization, we recommend pushing it to the Hub under your namespace. To do so, just call `push_to_hub` on the `tool` variable:
+
+```python
+tool.push_to_hub("hf-model-downloads")
+```
+
+You now have your code on the Hub! Let's take a look at the final step, which is to have the agent use it.
+
+#### Having the agent use the tool[[Having-the-agent-use-the-tool]]
+
+We now have our tool that lives on the Hub, which can be instantiated as such (change the user name for your tool):
+
+```python
+from transformers import load_tool
+
+tool = load_tool("lysandre/hf-model-downloads")
+```
+
+In order to use it in the agent, simply pass it in the `additional_tools` parameter of the agent initialization method:
+
+```python
+from transformers import HfAgent
+
+agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=[tool])
+
+agent.run(
+    "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?"
+)
+```
+which outputs the following:
+```text
+==Code generated by the agent==
+model = model_download_counter(task="text-to-video")
+print(f"The model with the most downloads is {model}.")
+audio_model = text_reader(model)
+
+
+==Result==
+The model with the most downloads is damo-vilab/text-to-video-ms-1.7b.
+```
+
+and generates the following audio.
+
+| **Audio** |
+|------------------------------------------------------------------------------------------------------------------------------------------------------|
+|